mcpbr 0.4.14__py3-none-any.whl → 0.4.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcpbr/benchmarks/__init__.py +12 -0
- mcpbr/benchmarks/adversarial.py +341 -0
- mcpbr/benchmarks/custom.py +607 -0
- mcpbr/benchmarks/longbench.py +623 -0
- mcpbr/benchmarks/mmmu.py +353 -0
- mcpbr/config.py +4 -0
- mcpbr/custom_metrics.py +405 -0
- mcpbr/dataset_versioning.py +222 -0
- mcpbr/docker_env.py +6 -0
- mcpbr/failure_analysis.py +558 -0
- mcpbr/few_shot.py +367 -0
- mcpbr/gpu_support.py +157 -0
- mcpbr/harness.py +8 -0
- mcpbr/latency_metrics.py +317 -0
- mcpbr/sampling.py +193 -0
- {mcpbr-0.4.14.dist-info → mcpbr-0.4.16.dist-info}/METADATA +10 -6
- {mcpbr-0.4.14.dist-info → mcpbr-0.4.16.dist-info}/RECORD +27 -16
- {mcpbr-0.4.14.data → mcpbr-0.4.16.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.4.14.data → mcpbr-0.4.16.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.4.14.data → mcpbr-0.4.16.data}/data/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.4.14.data → mcpbr-0.4.16.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.4.14.data → mcpbr-0.4.16.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.4.14.data → mcpbr-0.4.16.data}/data/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.4.14.data → mcpbr-0.4.16.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.4.14.dist-info → mcpbr-0.4.16.dist-info}/WHEEL +0 -0
- {mcpbr-0.4.14.dist-info → mcpbr-0.4.16.dist-info}/entry_points.txt +0 -0
- {mcpbr-0.4.14.dist-info → mcpbr-0.4.16.dist-info}/licenses/LICENSE +0 -0
mcpbr/latency_metrics.py
ADDED
@@ -0,0 +1,317 @@
"""Latency and performance benchmarking metrics for evaluation runs.

This module complements the PerformanceProfiler in profiler.py by providing
aggregate latency statistics across multiple evaluation tasks. While the profiler
tracks per-task performance, this module computes cross-task percentile distributions
and throughput metrics suitable for benchmarking reports.

Key capabilities:
- Per-task event timestamp tracking (start, first tool call, first response, end)
- Per-tool-call latency recording within each task
- Aggregate percentile statistics (p50, p95, p99, mean) across tasks
- Tokens-per-second throughput calculation
- Human-readable latency report formatting
"""

import statistics
from dataclasses import dataclass, field
from typing import Any


def percentile(data: list[float], p: float) -> float:
    """Calculate the p-th percentile of a list of values.

    Uses linear interpolation between closest ranks for accurate percentile
    estimation, falling back to boundary values for edge cases.

    Args:
        data: List of numeric values. Must not be empty.
        p: Percentile to compute, in range [0, 100].

    Returns:
        The interpolated percentile value.

    Raises:
        ValueError: If data is empty or p is outside [0, 100].
    """
    if not data:
        raise ValueError("Cannot compute percentile of empty data")
    if p < 0 or p > 100:
        raise ValueError(f"Percentile must be between 0 and 100, got {p}")

    sorted_data = sorted(data)
    n = len(sorted_data)

    if n == 1:
        return sorted_data[0]

    # Compute the rank using the C = 1 interpolation method (same as Excel PERCENTILE.INC)
    rank = (p / 100) * (n - 1)
    lower_index = int(rank)
    upper_index = lower_index + 1
    fraction = rank - lower_index

    if upper_index >= n:
        return sorted_data[-1]

    return sorted_data[lower_index] + fraction * (
        sorted_data[upper_index] - sorted_data[lower_index]
    )


@dataclass
class LatencyTracker:
    """Records timestamps for key events during a single evaluation task.

    Tracks the lifecycle of a task from start to end, including when the first
    tool call and first response occur. Also records individual tool call latencies
    for fine-grained analysis.

    Attributes:
        task_id: Identifier for the task being tracked.
        task_start: Timestamp (seconds since epoch) when the task began.
        first_tool_call: Timestamp when the first tool call was initiated.
        first_response: Timestamp when the first response was received.
        task_end: Timestamp when the task completed.
        tool_call_latencies: List of individual tool call durations in seconds.
        total_tokens: Total tokens (input + output) consumed during the task.
    """

    task_id: str = ""
    task_start: float | None = None
    first_tool_call: float | None = None
    first_response: float | None = None
    task_end: float | None = None
    tool_call_latencies: list[float] = field(default_factory=list)
    total_tokens: int = 0

    def record_task_start(self, timestamp: float) -> None:
        """Record the task start timestamp.

        Args:
            timestamp: Time in seconds (e.g., from time.time()).
        """
        self.task_start = timestamp

    def record_first_tool_call(self, timestamp: float) -> None:
        """Record the first tool call timestamp.

        Only records the first occurrence; subsequent calls are ignored.

        Args:
            timestamp: Time in seconds.
        """
        if self.first_tool_call is None:
            self.first_tool_call = timestamp

    def record_first_response(self, timestamp: float) -> None:
        """Record the first response timestamp.

        Only records the first occurrence; subsequent calls are ignored.

        Args:
            timestamp: Time in seconds.
        """
        if self.first_response is None:
            self.first_response = timestamp

    def record_task_end(self, timestamp: float) -> None:
        """Record the task end timestamp.

        Args:
            timestamp: Time in seconds.
        """
        self.task_end = timestamp

    def record_tool_call_latency(self, duration_seconds: float) -> None:
        """Record the latency of an individual tool call.

        Args:
            duration_seconds: Duration of the tool call in seconds.
        """
        self.tool_call_latencies.append(duration_seconds)

    @property
    def time_to_first_tool_call(self) -> float | None:
        """Calculate time from task start to first tool call.

        Returns:
            Duration in seconds, or None if either timestamp is missing.
        """
        if self.task_start is not None and self.first_tool_call is not None:
            return self.first_tool_call - self.task_start
        return None

    @property
    def total_task_duration(self) -> float | None:
        """Calculate total task duration from start to end.

        Returns:
            Duration in seconds, or None if either timestamp is missing.
        """
        if self.task_start is not None and self.task_end is not None:
            return self.task_end - self.task_start
        return None

    @property
    def tokens_per_second(self) -> float | None:
        """Calculate throughput in tokens per second.

        Returns:
            Tokens per second, or None if duration is zero or unavailable.
        """
        duration = self.total_task_duration
        if duration is not None and duration > 0 and self.total_tokens > 0:
            return self.total_tokens / duration
        return None


def _compute_distribution(values: list[float]) -> dict[str, float]:
    """Compute percentile distribution and mean for a list of values.

    Args:
        values: List of numeric values. Must not be empty.

    Returns:
        Dictionary with keys: p50, p95, p99, mean.
    """
    return {
        "p50": percentile(values, 50),
        "p95": percentile(values, 95),
        "p99": percentile(values, 99),
        "mean": statistics.mean(values),
    }


def compute_latency_stats(trackers: list["LatencyTracker"]) -> dict[str, Any]:
    """Compute aggregate latency statistics across multiple task trackers.

    Collects timing data from all trackers and produces percentile distributions
    for key metrics: time to first tool call, total task duration, individual
    tool call latency, and tokens-per-second throughput.

    Args:
        trackers: List of LatencyTracker instances with recorded data.

    Returns:
        Dictionary containing:
        - time_to_first_tool_call: {p50, p95, p99, mean} or None
        - total_task_duration: {p50, p95, p99, mean} or None
        - tool_call_latency: {p50, p95, p99, mean} or None
        - tokens_per_second: {p50, p95, p99, mean} or None
        - task_count: number of trackers analyzed
    """
    if not trackers:
        return {
            "time_to_first_tool_call": None,
            "total_task_duration": None,
            "tool_call_latency": None,
            "tokens_per_second": None,
            "task_count": 0,
        }

    # Collect values from all trackers
    ttftc_values: list[float] = []
    duration_values: list[float] = []
    tool_latency_values: list[float] = []
    tps_values: list[float] = []

    for tracker in trackers:
        ttftc = tracker.time_to_first_tool_call
        if ttftc is not None:
            ttftc_values.append(ttftc)

        duration = tracker.total_task_duration
        if duration is not None:
            duration_values.append(duration)

        tool_latency_values.extend(tracker.tool_call_latencies)

        tps = tracker.tokens_per_second
        if tps is not None:
            tps_values.append(tps)

    return {
        "time_to_first_tool_call": _compute_distribution(ttftc_values) if ttftc_values else None,
        "total_task_duration": _compute_distribution(duration_values) if duration_values else None,
        "tool_call_latency": (
            _compute_distribution(tool_latency_values) if tool_latency_values else None
        ),
        "tokens_per_second": _compute_distribution(tps_values) if tps_values else None,
        "task_count": len(trackers),
    }


def _format_distribution(label: str, dist: dict[str, float], unit: str = "s") -> str:
    """Format a single distribution as a human-readable line.

    Args:
        label: Name of the metric.
        dist: Distribution dict with p50, p95, p99, mean.
        unit: Unit suffix to append to values.

    Returns:
        Formatted string line.
    """
    return (
        f"  {label}:\n"
        f"    Mean: {dist['mean']:.3f}{unit}\n"
        f"    p50: {dist['p50']:.3f}{unit}\n"
        f"    p95: {dist['p95']:.3f}{unit}\n"
        f"    p99: {dist['p99']:.3f}{unit}"
    )


def format_latency_report(stats: dict[str, Any]) -> str:
    """Format latency statistics into a human-readable report.

    Produces a multi-line text report suitable for console output or inclusion
    in benchmark result files.

    Args:
        stats: Statistics dictionary as returned by compute_latency_stats().

    Returns:
        Formatted multi-line report string.
    """
    lines: list[str] = []
    lines.append("=" * 50)
    lines.append("Latency & Performance Report")
    lines.append("=" * 50)
    lines.append(f"Tasks analyzed: {stats.get('task_count', 0)}")
    lines.append("")

    ttftc = stats.get("time_to_first_tool_call")
    if ttftc is not None:
        lines.append(_format_distribution("Time to First Tool Call", ttftc))
        lines.append("")

    duration = stats.get("total_task_duration")
    if duration is not None:
        lines.append(_format_distribution("Total Task Duration", duration))
        lines.append("")

    tool_latency = stats.get("tool_call_latency")
    if tool_latency is not None:
        lines.append(_format_distribution("Tool Call Latency", tool_latency))
        lines.append("")

    tps = stats.get("tokens_per_second")
    if tps is not None:
        lines.append(_format_distribution("Throughput", tps, unit=" tok/s"))
        lines.append("")

    if all(
        stats.get(key) is None
        for key in [
            "time_to_first_tool_call",
            "total_task_duration",
            "tool_call_latency",
            "tokens_per_second",
        ]
    ):
        lines.append("  No latency data available.")
        lines.append("")

    lines.append("=" * 50)
    return "\n".join(lines)
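For orientation, here is a minimal usage sketch of the new module. It is illustrative only, not part of the diff: it assumes the 0.4.16 wheel is installed, and the timestamps, token counts, and task ids are synthetic.

# Usage sketch (illustrative, not part of the package).
import time

from mcpbr.latency_metrics import (
    LatencyTracker,
    compute_latency_stats,
    format_latency_report,
    percentile,
)

# PERCENTILE.INC-style interpolation: rank = (95 / 100) * (4 - 1) = 2.85,
# so the result is 3 + 0.85 * (4 - 3) = 3.85.
print(percentile([1.0, 2.0, 3.0, 4.0], 95))  # ~3.85

trackers = []
for i, (duration, tokens) in enumerate([(2.0, 400), (3.0, 450), (5.0, 900)]):
    t = LatencyTracker(task_id=f"task-{i}")  # synthetic task ids
    start = time.time()
    t.record_task_start(start)
    t.record_first_tool_call(start + 0.2)  # first tool call 200 ms in
    t.record_tool_call_latency(0.5)        # one 500 ms tool call
    t.record_task_end(start + duration)
    t.total_tokens = tokens
    trackers.append(t)

stats = compute_latency_stats(trackers)
print(stats["total_task_duration"]["p50"])  # 3.0 (median of 2.0, 3.0, 5.0)
print(format_latency_report(stats))

Note that a single-task run degrades gracefully: with n == 1, percentile() returns the lone value for every percentile.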
mcpbr/sampling.py
ADDED
@@ -0,0 +1,193 @@
"""Sampling strategies for benchmark task selection.

Provides random and stratified sampling with seed control for reproducible
benchmark evaluations. Supports sequential (default), random, and stratified
sampling strategies.
"""

import random
from collections import defaultdict
from enum import Enum
from typing import Any


class SamplingStrategy(Enum):
    """Sampling strategy for selecting benchmark tasks.

    Attributes:
        SEQUENTIAL: Take the first N tasks (default behavior, backward compatible).
        RANDOM: Randomly sample N tasks with optional seed for reproducibility.
        STRATIFIED: Group tasks by a field, then sample proportionally from each group.
    """

    SEQUENTIAL = "sequential"
    RANDOM = "random"
    STRATIFIED = "stratified"


def sample_tasks(
    tasks: list[dict[str, Any]],
    sample_size: int | None = None,
    strategy: SamplingStrategy = SamplingStrategy.SEQUENTIAL,
    seed: int | None = None,
    stratify_field: str | None = None,
) -> list[dict[str, Any]]:
    """Sample tasks from a list using the specified strategy.

    Args:
        tasks: Full list of task dictionaries to sample from.
        sample_size: Number of tasks to select. None returns all tasks.
        strategy: Sampling strategy to use.
        seed: Random seed for reproducibility (used by RANDOM and STRATIFIED).
        stratify_field: Field name to group by for STRATIFIED sampling.
            Required when strategy is STRATIFIED.

    Returns:
        List of sampled task dictionaries.

    Raises:
        ValueError: If strategy is STRATIFIED but stratify_field is not provided.
        ValueError: If strategy is STRATIFIED but stratify_field is not found in any task.
    """
    if not tasks:
        return []

    if sample_size is None or sample_size >= len(tasks):
        return list(tasks)

    if sample_size <= 0:
        return []

    if strategy == SamplingStrategy.SEQUENTIAL:
        return _sample_sequential(tasks, sample_size)
    elif strategy == SamplingStrategy.RANDOM:
        return _sample_random(tasks, sample_size, seed)
    elif strategy == SamplingStrategy.STRATIFIED:
        return _sample_stratified(tasks, sample_size, seed, stratify_field)
    else:
        raise ValueError(f"Unknown sampling strategy: {strategy}")


def _sample_sequential(
    tasks: list[dict[str, Any]],
    sample_size: int,
) -> list[dict[str, Any]]:
    """Take the first N tasks sequentially.

    This matches the existing behavior where tasks[:sample_size] is used.

    Args:
        tasks: Full list of tasks.
        sample_size: Number of tasks to select.

    Returns:
        First sample_size tasks from the list.
    """
    return tasks[:sample_size]


def _sample_random(
    tasks: list[dict[str, Any]],
    sample_size: int,
    seed: int | None = None,
) -> list[dict[str, Any]]:
    """Randomly sample N tasks with optional seed for reproducibility.

    Args:
        tasks: Full list of tasks.
        sample_size: Number of tasks to select.
        seed: Random seed for reproducibility.

    Returns:
        Randomly selected tasks.
    """
    rng = random.Random(seed)
    return rng.sample(tasks, sample_size)


def _sample_stratified(
    tasks: list[dict[str, Any]],
    sample_size: int,
    seed: int | None = None,
    stratify_field: str | None = None,
) -> list[dict[str, Any]]:
    """Sample proportionally from groups defined by stratify_field.

    Groups tasks by the value of stratify_field, then samples from each group
    proportionally to its size in the original dataset. Uses round-robin allocation
    for any remainder to ensure exact sample_size is met.

    Args:
        tasks: Full list of tasks.
        sample_size: Total number of tasks to select across all groups.
        seed: Random seed for reproducibility.
        stratify_field: Field name to group tasks by.

    Returns:
        Stratified sample of tasks.

    Raises:
        ValueError: If stratify_field is None or not found in any task.
    """
    if not stratify_field:
        raise ValueError("stratify_field is required when using STRATIFIED sampling strategy")

    # Group tasks by the stratify_field value
    groups: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for task in tasks:
        key = str(task.get(stratify_field, "_unknown_"))
        groups[key] = groups.get(key, [])
        groups[key].append(task)

    # Check that at least one task had the stratify_field
    if len(groups) == 1 and "_unknown_" in groups:
        raise ValueError(
            f"stratify_field '{stratify_field}' not found in any task. "
            f"Available fields: {list(tasks[0].keys()) if tasks else []}"
        )

    total_tasks = len(tasks)
    rng = random.Random(seed)

    # Sort group keys for deterministic ordering
    sorted_keys = sorted(groups.keys())

    # Calculate proportional allocation for each group
    allocations: dict[str, int] = {}
    allocated = 0
    for key in sorted_keys:
        group_size = len(groups[key])
        # Proportional allocation (floor)
        proportion = group_size / total_tasks
        count = int(sample_size * proportion)
        allocations[key] = count
        allocated += count

    # Distribute remainder using round-robin over groups sorted by fractional part
    remainder = sample_size - allocated
    if remainder > 0:
        # Sort groups by their fractional allocation (descending) for fair distribution
        fractional_parts = []
        for key in sorted_keys:
            group_size = len(groups[key])
            proportion = group_size / total_tasks
            exact = sample_size * proportion
            fractional = exact - int(exact)
            fractional_parts.append((fractional, key))

        fractional_parts.sort(key=lambda x: x[0], reverse=True)

        for i in range(remainder):
            _, key = fractional_parts[i % len(fractional_parts)]
            allocations[key] += 1

    # Sample from each group
    result: list[dict[str, Any]] = []
    for key in sorted_keys:
        group = groups[key]
        count = min(allocations[key], len(group))
        if count > 0:
            sampled = rng.sample(group, count)
            result.extend(sampled)

    return result
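Likewise, a short sketch of the sampling API. Illustrative only, not part of the diff: the "category" field and task dictionaries are invented for the example.

# Usage sketch (illustrative, not part of the package).
from mcpbr.sampling import SamplingStrategy, sample_tasks

tasks = [{"id": f"easy-{i}", "category": "easy"} for i in range(6)] + [
    {"id": f"hard-{i}", "category": "hard"} for i in range(3)
]

# Proportional allocation for sample_size=3: "easy" gets int(3 * 6/9) = 2,
# "hard" gets int(3 * 3/9) = 1, and there is no remainder to distribute.
sample = sample_tasks(
    tasks,
    sample_size=3,
    strategy=SamplingStrategy.STRATIFIED,
    seed=42,
    stratify_field="category",
)
assert len(sample) == 3
assert sum(t["category"] == "easy" for t in sample) == 2

# With sample_size=4 the floors allocate only 2 + 1 = 3, and the leftover
# slot goes to "easy", the group with the larger fractional part (0.67 vs 0.33).
sample4 = sample_tasks(
    tasks,
    sample_size=4,
    strategy=SamplingStrategy.STRATIFIED,
    seed=0,
    stratify_field="category",
)
assert sum(t["category"] == "easy" for t in sample4) == 3

# The default strategy stays backward compatible: a plain first-N slice.
assert sample_tasks(tasks, 2) == tasks[:2]

Passing the same seed with the same input yields the same sample, which is what makes runs reproducible across machines.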
{mcpbr-0.4.14.dist-info → mcpbr-0.4.16.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mcpbr
-Version: 0.4.14
+Version: 0.4.16
 Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
 Project-URL: Homepage, https://github.com/greynewell/mcpbr
 Project-URL: Repository, https://github.com/greynewell/mcpbr
@@ -100,7 +100,7 @@ mcpbr runs controlled experiments: same model, same tasks, same environment - th
 
 ## Supported Benchmarks
 
-mcpbr supports 25+ benchmarks across 8 categories through a flexible abstraction layer:
+mcpbr supports 30+ benchmarks across 10 categories through a flexible abstraction layer:
 
 | Category | Benchmarks |
 |----------|-----------|
@@ -111,7 +111,11 @@ mcpbr supports 25+ benchmarks across 8 categories through a flexible abstraction
 | **Tool Use & Agents** | [MCPToolBench++](https://greynewell.github.io/mcpbr/benchmarks/mcptoolbench/), [ToolBench](https://greynewell.github.io/mcpbr/benchmarks/toolbench/), [AgentBench](https://greynewell.github.io/mcpbr/benchmarks/agentbench/), [WebArena](https://greynewell.github.io/mcpbr/benchmarks/webarena/), [TerminalBench](https://greynewell.github.io/mcpbr/benchmarks/terminalbench/), [InterCode](https://greynewell.github.io/mcpbr/benchmarks/intercode/) |
 | **ML Research** | [MLAgentBench](https://greynewell.github.io/mcpbr/benchmarks/mlagentbench/) |
 | **Code Understanding** | [RepoQA](https://greynewell.github.io/mcpbr/benchmarks/repoqa/) |
+| **Multimodal** | MMMU |
+| **Long Context** | LongBench |
+| **Safety & Adversarial** | Adversarial (HarmBench) |
 | **Security** | [CyberGym](https://greynewell.github.io/mcpbr/benchmarks/cybergym/) |
+| **Custom** | User-defined benchmarks via YAML |
 
 ### Featured Benchmarks
 
@@ -1470,10 +1474,10 @@ We're building the defacto standard for MCP server benchmarking! Our [v1.0 Roadm
 - Cost analysis in reports
 
 **Phase 2: Benchmarks** (v0.4.0)
-- 
-- 
-- Custom
-- 
+- ✅ 30+ benchmarks across 10 categories
+- ✅ Custom benchmark YAML support
+- ✅ Custom metrics, failure analysis, sampling strategies
+- ✅ Dataset versioning, latency metrics, GPU support, few-shot learning
 
 **Phase 3: Developer Experience** (v0.5.0)
 - Real-time dashboard
{mcpbr-0.4.14.dist-info → mcpbr-0.4.16.dist-info}/RECORD
@@ -3,16 +3,22 @@ mcpbr/__main__.py,sha256=WmeQsAqtW_9tMTNKArH1m76DPBokZpXuy6dMZp13gXA,132
 mcpbr/agent.py,sha256=aSFH2S3ExKZfdVfMbzk6D1nRhpKt4JmpRzmF4Vi6Gmo,5795
 mcpbr/cache.py,sha256=YiP13omwMbXLb6NhNocJvL58enXEx9J8OrvTZnWUkw4,13254
 mcpbr/cli.py,sha256=xvh7gpJx0LzjV3g-Te4FF7BfHubGzDxOiYQsSeQnCEc,68276
-mcpbr/config.py,sha256=
+mcpbr/config.py,sha256=7lWV0ZtzyD6WZ07IR4yhT9lyBBPONzlanaO4XHm9OoE,18952
 mcpbr/config_inheritance.py,sha256=0EV9Tv62UFNgZoc8mY7yYjHEbnMM_R5EAhSeuK7ajAA,6617
 mcpbr/config_validator.py,sha256=ZMEIeK4y6fSwyY46Xv5dK5v3jM4HDKcYkosnIcn7iyI,20488
-mcpbr/
+mcpbr/custom_metrics.py,sha256=4pMO9-BPpeQ_GUTnZ18TQXINFScAMH3cIYm0HG-C51o,13213
+mcpbr/dataset_versioning.py,sha256=Y_ZSGhl8ihl6Kgee_p7VbkNwGhgwIdMZPlRunvk4knY,7149
+mcpbr/docker_env.py,sha256=_45OUZKjUevE9O3YLF_1uvQtdOyJ7yZIYWmSvXN3cFw,31794
 mcpbr/env_expansion.py,sha256=Rkhth-tWV8CptQlSSk9exuMsUaSTTW9hj69z4snZd_U,6122
 mcpbr/evaluation.py,sha256=EjPREWv7hBRqhBhNan0ERh2imqMBegT0Y2cgZlTxRGk,12765
-mcpbr/
+mcpbr/failure_analysis.py,sha256=N5xp9YPe2d7P9fTa2LVSHsPgB1WOQtWMeClq3bOv4_c,19883
+mcpbr/few_shot.py,sha256=bFDdes_kgZAFWoFZQEfZG5Z2Es9rmkB1jsxSMp4aCCM,11684
+mcpbr/gpu_support.py,sha256=eroBiLkt1A3Q2ODJDSyqrd_BzcMh8tFkjtPn7PsvJJc,5070
+mcpbr/harness.py,sha256=8-qmcPR2CDFuoBib9g6lPx7aMOK-5PuZgpWhpGs-Ils,51419
 mcpbr/harnesses.py,sha256=h9iDp4qkPABNwO9OXbJ61qcD4n0oAUTU7AQksxRKLcg,47335
 mcpbr/incremental_save.py,sha256=1dm3pGiEIhP8cVk_Y6XF_cAdo3B_vyRc6CO8Wt-MyIA,4830
 mcpbr/junit_reporter.py,sha256=M_02zJbFbA3VoIYG5oR7VDecqWHEpIee-JOUShWNuLU,9261
+mcpbr/latency_metrics.py,sha256=xNMaUzGMSbOIfuoyZGyIfyMk5uAmoj6K65ZAs5D6Z8c,10476
 mcpbr/log_formatter.py,sha256=d2jWH7z4IRSbr8-PbnEt3TmLAqk8vgdPT38uTnTCN5c,21488
 mcpbr/models.py,sha256=zsrBrwFeOfNKgThUbT1oPkF5pdRjL1QJjMte0vXjcbk,3710
 mcpbr/output_validator.py,sha256=TUoBtDjjXvR6MACbWV6uNOsxM_n4C0Jbn5in35HH4K8,1750
@@ -22,6 +28,7 @@ mcpbr/profiler.py,sha256=SRXLKf2TOlpnMbQpGvjRy1Agv-XaEz6lDmBa5WGNv8c,15954
 mcpbr/providers.py,sha256=ebrnH6RXODxX4Ma9r7Is5VBHYFNP5LwCs-vpLbbHP8o,6598
 mcpbr/regression.py,sha256=xm_ago8ZP3RAOrDNjtINwyRUvzKWJcJDWbzf3hp6LlU,12827
 mcpbr/reporting.py,sha256=Odzb7EgpimW-qh01VQedhb2X594ACrOcGe4jshgiwTg,56111
+mcpbr/sampling.py,sha256=Hpgh2TayI3QGcno-Np9eYi8sklxKEZQXyhpaQlc9T4Q,6248
 mcpbr/schema.py,sha256=fdjiKmp1au2oN5aXcPRoCbyvwm2XeMD5DmeWSurMk4A,6858
 mcpbr/smoke_test.py,sha256=srYGOn_auspRbt_a6ebYDDDq_nujA_iZGman5nU1ikU,14925
 mcpbr/state_tracker.py,sha256=rIP9LIHtQg6oBsLIxnwRjE865Kw6U7DMO_GzzuMRC0E,10790
@@ -29,7 +36,8 @@ mcpbr/statistics.py,sha256=Ny8TMdBrIpS4KfKCJcuFfTeaGuTmEkS1G_uHBlboYdA,19134
 mcpbr/streaming.py,sha256=XPhkXO1R1EsWtkoPvCpyy4TehEom7hkuOeP-00joX3o,13853
 mcpbr/swebench_test_specs.py,sha256=Mh_BPjcexkgDT3p4zT2p31925b8w5tgsxxRpYZQZalM,1390
 mcpbr/templates.py,sha256=dqwboVB-yfE06w2rgDOvuWJB4Hx5duH_W-jvLBqmlKg,10683
-mcpbr/benchmarks/__init__.py,sha256=
+mcpbr/benchmarks/__init__.py,sha256=2-7Ebg6-wHo1QGfVKWjjbREcLG_A-6Q0XfZGiyXrOeE,4489
+mcpbr/benchmarks/adversarial.py,sha256=69VBTZv6BhR1JwjQepA_YwAu3b--vJviGd6IWs2h1QA,12357
 mcpbr/benchmarks/agentbench.py,sha256=jQ8OG_5cn-PvOZizXivysLTw9xvtA8c_MWfw3jXq0TQ,6512
 mcpbr/benchmarks/aider_polyglot.py,sha256=_uWYNVaW0YWEWuuSXNxsqSngvWjo0HUeubcj16Q25uk,7256
 mcpbr/benchmarks/apps.py,sha256=mvN26KNICxGZh0sxCmxR0Ph6hfXnqRsVO-oB5I6MjgQ,7801
@@ -39,6 +47,7 @@ mcpbr/benchmarks/bigbench_hard.py,sha256=jwG5YV97xo6FiNnpAUseJVO_a_6QkpCYZ1r1mGi
 mcpbr/benchmarks/bigcodebench.py,sha256=dK4QkRTM6D1v3pprBgAxSTsOz7mJqi9f4sOfMKJUJXM,7117
 mcpbr/benchmarks/codecontests.py,sha256=Kx_izYR9D1sMcfVtslCN0upGsPtbXir7UHjL1fEZzc0,8905
 mcpbr/benchmarks/codereval.py,sha256=n77q2mXgMNg7wdeoMOSNKbLh86IrwG8iIzd64Gb0NEc,8341
+mcpbr/benchmarks/custom.py,sha256=cjuhZLSyS4oCZun-3JJo3fsSVs-lcRv5kzaoQ_m2MTU,20675
 mcpbr/benchmarks/cybergym.py,sha256=r5itZNGdiDtztlC_BGLCdtLBZu0jgAyyG2_8cNUCoJ8,18574
 mcpbr/benchmarks/gaia.py,sha256=4Lxe6YAbKyIiPYgszvRcoia74TLZ6FqoIY5_337Vjtw,6852
 mcpbr/benchmarks/gsm8k.py,sha256=CK9C6qQi3rO81nuGcE-od2-PvQ48lmL-nQcLIeZDrbM,12730
@@ -46,10 +55,12 @@ mcpbr/benchmarks/hellaswag.py,sha256=Ah8Pub7QI94lgGHnbC6g3US4NTkt-zWSReS4h9Y6XGU
 mcpbr/benchmarks/humaneval.py,sha256=J9hCB17ppey81p4HS2ynGFsDDGLOdJhw63OSaG7vhT8,18296
 mcpbr/benchmarks/intercode.py,sha256=iq0X75aL469xIR8mVGUNaPlgdqAlySPsa2YWoSftw5M,8737
 mcpbr/benchmarks/leetcode.py,sha256=lan8A5D5Bfe5B6t_wx4KzZsAr9iNF7vch0Em2g9bX-k,7772
+mcpbr/benchmarks/longbench.py,sha256=Hb4lGiojG3apRajgsI7c0DkcP1WzqdMrdpPEkI-WAkE,20791
 mcpbr/benchmarks/math_benchmark.py,sha256=LP_gjp3Cgzt1kDWVPqufRHg0YE0N9ouThOI6avpYxCk,8322
 mcpbr/benchmarks/mbpp.py,sha256=e1tgQJOEeAQAlkeYMBr4jymTYvC9s_Nt34TKExFVFy4,6907
 mcpbr/benchmarks/mcptoolbench.py,sha256=ioXPdXeXQEgBCHccOq7ier_-ucfQI41hUu0Z4HSIIAg,16209
 mcpbr/benchmarks/mlagentbench.py,sha256=Qr_BRhQFgK66KcEAr0svP44a-twWkXeTQVPQHdX7HpM,8367
+mcpbr/benchmarks/mmmu.py,sha256=jvIgpM-ofJAkmuDKA0jMktDBsX41s0zyC8PRG5qSBlw,11929
 mcpbr/benchmarks/repoqa.py,sha256=0Z9WxXl2dFgSWLNRGFNGd2kOU_rItNrtSdF8ZbC2TqI,6509
 mcpbr/benchmarks/swebench.py,sha256=Eo4dL1BLabQqZvSLR9xqoDmEdy0Y0mLTgincbV78DjQ,6473
 mcpbr/benchmarks/terminalbench.py,sha256=I9YLeZh5j_AYvUJFhZkhlDTfIWU3OvcuJLjzYlfAZuw,7166
@@ -69,15 +80,15 @@ mcpbr/infrastructure/azure_health.py,sha256=xITmIa9IfYIwxcVhY0sJ81a-6WNKiT8kSQTd
 mcpbr/infrastructure/base.py,sha256=Olj6uiNBeGoUqltZI1NHZfa26kzT-6jfp8YIXSykFKM,3037
 mcpbr/infrastructure/local.py,sha256=VK6UAg7Dzvb9v1LAJgNGA_s0blQKrHAQEXBAC75zAL8,4237
 mcpbr/infrastructure/manager.py,sha256=j0T7U1Tbajmfve4SNfhYKikvL9kgSVT01fYKMC-sH-s,4796
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
+mcpbr-0.4.16.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
+mcpbr-0.4.16.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
+mcpbr-0.4.16.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
+mcpbr-0.4.16.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
+mcpbr-0.4.16.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
+mcpbr-0.4.16.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
+mcpbr-0.4.16.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
+mcpbr-0.4.16.dist-info/METADATA,sha256=GeSnMZw0x7-XPhblIu50aCO7NXaNfjgVScnBOp6ZaOA,55069
+mcpbr-0.4.16.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+mcpbr-0.4.16.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
+mcpbr-0.4.16.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
+mcpbr-0.4.16.dist-info/RECORD,,