mcpbr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl
- mcpbr/benchmarks/__init__.py +12 -0
- mcpbr/benchmarks/adversarial.py +341 -0
- mcpbr/benchmarks/custom.py +607 -0
- mcpbr/benchmarks/longbench.py +623 -0
- mcpbr/benchmarks/mmmu.py +353 -0
- mcpbr/config.py +4 -0
- mcpbr/config_migration.py +470 -0
- mcpbr/config_wizard.py +647 -0
- mcpbr/custom_metrics.py +405 -0
- mcpbr/dashboard.py +619 -0
- mcpbr/dataset_streaming.py +491 -0
- mcpbr/dataset_versioning.py +222 -0
- mcpbr/docker_cache.py +539 -0
- mcpbr/docker_prewarm.py +369 -0
- mcpbr/dry_run.py +532 -0
- mcpbr/failure_analysis.py +558 -0
- mcpbr/few_shot.py +367 -0
- mcpbr/formatting.py +444 -0
- mcpbr/gpu_support.py +157 -0
- mcpbr/harness.py +38 -4
- mcpbr/latency_metrics.py +317 -0
- mcpbr/resource_limits.py +487 -0
- mcpbr/result_streaming.py +519 -0
- mcpbr/sampling.py +193 -0
- mcpbr/task_batching.py +403 -0
- mcpbr/task_scheduler.py +468 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/METADATA +10 -6
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/RECORD +38 -15
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/WHEEL +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/entry_points.txt +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/latency_metrics.py
ADDED
@@ -0,0 +1,317 @@
"""Latency and performance benchmarking metrics for evaluation runs.

This module complements the PerformanceProfiler in profiler.py by providing
aggregate latency statistics across multiple evaluation tasks. While the profiler
tracks per-task performance, this module computes cross-task percentile distributions
and throughput metrics suitable for benchmarking reports.

Key capabilities:
- Per-task event timestamp tracking (start, first tool call, first response, end)
- Per-tool-call latency recording within each task
- Aggregate percentile statistics (p50, p95, p99, mean) across tasks
- Tokens-per-second throughput calculation
- Human-readable latency report formatting
"""

import statistics
from dataclasses import dataclass, field
from typing import Any


def percentile(data: list[float], p: float) -> float:
    """Calculate the p-th percentile of a list of values.

    Uses linear interpolation between closest ranks for accurate percentile
    estimation, falling back to boundary values for edge cases.

    Args:
        data: List of numeric values. Must not be empty.
        p: Percentile to compute, in range [0, 100].

    Returns:
        The interpolated percentile value.

    Raises:
        ValueError: If data is empty or p is outside [0, 100].
    """
    if not data:
        raise ValueError("Cannot compute percentile of empty data")
    if p < 0 or p > 100:
        raise ValueError(f"Percentile must be between 0 and 100, got {p}")

    sorted_data = sorted(data)
    n = len(sorted_data)

    if n == 1:
        return sorted_data[0]

    # Compute the rank using the C = 1 interpolation method (same as Excel PERCENTILE.INC)
    rank = (p / 100) * (n - 1)
    lower_index = int(rank)
    upper_index = lower_index + 1
    fraction = rank - lower_index

    if upper_index >= n:
        return sorted_data[-1]

    return sorted_data[lower_index] + fraction * (
        sorted_data[upper_index] - sorted_data[lower_index]
    )


@dataclass
class LatencyTracker:
    """Records timestamps for key events during a single evaluation task.

    Tracks the lifecycle of a task from start to end, including when the first
    tool call and first response occur. Also records individual tool call latencies
    for fine-grained analysis.

    Attributes:
        task_id: Identifier for the task being tracked.
        task_start: Timestamp (seconds since epoch) when the task began.
        first_tool_call: Timestamp when the first tool call was initiated.
        first_response: Timestamp when the first response was received.
        task_end: Timestamp when the task completed.
        tool_call_latencies: List of individual tool call durations in seconds.
        total_tokens: Total tokens (input + output) consumed during the task.
    """

    task_id: str = ""
    task_start: float | None = None
    first_tool_call: float | None = None
    first_response: float | None = None
    task_end: float | None = None
    tool_call_latencies: list[float] = field(default_factory=list)
    total_tokens: int = 0

    def record_task_start(self, timestamp: float) -> None:
        """Record the task start timestamp.

        Args:
            timestamp: Time in seconds (e.g., from time.time()).
        """
        self.task_start = timestamp

    def record_first_tool_call(self, timestamp: float) -> None:
        """Record the first tool call timestamp.

        Only records the first occurrence; subsequent calls are ignored.

        Args:
            timestamp: Time in seconds.
        """
        if self.first_tool_call is None:
            self.first_tool_call = timestamp

    def record_first_response(self, timestamp: float) -> None:
        """Record the first response timestamp.

        Only records the first occurrence; subsequent calls are ignored.

        Args:
            timestamp: Time in seconds.
        """
        if self.first_response is None:
            self.first_response = timestamp

    def record_task_end(self, timestamp: float) -> None:
        """Record the task end timestamp.

        Args:
            timestamp: Time in seconds.
        """
        self.task_end = timestamp

    def record_tool_call_latency(self, duration_seconds: float) -> None:
        """Record the latency of an individual tool call.

        Args:
            duration_seconds: Duration of the tool call in seconds.
        """
        self.tool_call_latencies.append(duration_seconds)

    @property
    def time_to_first_tool_call(self) -> float | None:
        """Calculate time from task start to first tool call.

        Returns:
            Duration in seconds, or None if either timestamp is missing.
        """
        if self.task_start is not None and self.first_tool_call is not None:
            return self.first_tool_call - self.task_start
        return None

    @property
    def total_task_duration(self) -> float | None:
        """Calculate total task duration from start to end.

        Returns:
            Duration in seconds, or None if either timestamp is missing.
        """
        if self.task_start is not None and self.task_end is not None:
            return self.task_end - self.task_start
        return None

    @property
    def tokens_per_second(self) -> float | None:
        """Calculate throughput in tokens per second.

        Returns:
            Tokens per second, or None if duration is zero or unavailable.
        """
        duration = self.total_task_duration
        if duration is not None and duration > 0 and self.total_tokens > 0:
            return self.total_tokens / duration
        return None


def _compute_distribution(values: list[float]) -> dict[str, float]:
    """Compute percentile distribution and mean for a list of values.

    Args:
        values: List of numeric values. Must not be empty.

    Returns:
        Dictionary with keys: p50, p95, p99, mean.
    """
    return {
        "p50": percentile(values, 50),
        "p95": percentile(values, 95),
        "p99": percentile(values, 99),
        "mean": statistics.mean(values),
    }


def compute_latency_stats(trackers: list["LatencyTracker"]) -> dict[str, Any]:
    """Compute aggregate latency statistics across multiple task trackers.

    Collects timing data from all trackers and produces percentile distributions
    for key metrics: time to first tool call, total task duration, individual
    tool call latency, and tokens-per-second throughput.

    Args:
        trackers: List of LatencyTracker instances with recorded data.

    Returns:
        Dictionary containing:
        - time_to_first_tool_call: {p50, p95, p99, mean} or None
        - total_task_duration: {p50, p95, p99, mean} or None
        - tool_call_latency: {p50, p95, p99, mean} or None
        - tokens_per_second: {p50, p95, p99, mean} or None
        - task_count: number of trackers analyzed
    """
    if not trackers:
        return {
            "time_to_first_tool_call": None,
            "total_task_duration": None,
            "tool_call_latency": None,
            "tokens_per_second": None,
            "task_count": 0,
        }

    # Collect values from all trackers
    ttftc_values: list[float] = []
    duration_values: list[float] = []
    tool_latency_values: list[float] = []
    tps_values: list[float] = []

    for tracker in trackers:
        ttftc = tracker.time_to_first_tool_call
        if ttftc is not None:
            ttftc_values.append(ttftc)

        duration = tracker.total_task_duration
        if duration is not None:
            duration_values.append(duration)

        tool_latency_values.extend(tracker.tool_call_latencies)

        tps = tracker.tokens_per_second
        if tps is not None:
            tps_values.append(tps)

    return {
        "time_to_first_tool_call": _compute_distribution(ttftc_values) if ttftc_values else None,
        "total_task_duration": _compute_distribution(duration_values) if duration_values else None,
        "tool_call_latency": (
            _compute_distribution(tool_latency_values) if tool_latency_values else None
        ),
        "tokens_per_second": _compute_distribution(tps_values) if tps_values else None,
        "task_count": len(trackers),
    }


def _format_distribution(label: str, dist: dict[str, float], unit: str = "s") -> str:
    """Format a single distribution as a human-readable line.

    Args:
        label: Name of the metric.
        dist: Distribution dict with p50, p95, p99, mean.
        unit: Unit suffix to append to values.

    Returns:
        Formatted string line.
    """
    return (
        f"  {label}:\n"
        f"    Mean: {dist['mean']:.3f}{unit}\n"
        f"    p50: {dist['p50']:.3f}{unit}\n"
        f"    p95: {dist['p95']:.3f}{unit}\n"
        f"    p99: {dist['p99']:.3f}{unit}"
    )


def format_latency_report(stats: dict[str, Any]) -> str:
    """Format latency statistics into a human-readable report.

    Produces a multi-line text report suitable for console output or inclusion
    in benchmark result files.

    Args:
        stats: Statistics dictionary as returned by compute_latency_stats().

    Returns:
        Formatted multi-line report string.
    """
    lines: list[str] = []
    lines.append("=" * 50)
    lines.append("Latency & Performance Report")
    lines.append("=" * 50)
    lines.append(f"Tasks analyzed: {stats.get('task_count', 0)}")
    lines.append("")

    ttftc = stats.get("time_to_first_tool_call")
    if ttftc is not None:
        lines.append(_format_distribution("Time to First Tool Call", ttftc))
        lines.append("")

    duration = stats.get("total_task_duration")
    if duration is not None:
        lines.append(_format_distribution("Total Task Duration", duration))
        lines.append("")

    tool_latency = stats.get("tool_call_latency")
    if tool_latency is not None:
        lines.append(_format_distribution("Tool Call Latency", tool_latency))
        lines.append("")

    tps = stats.get("tokens_per_second")
    if tps is not None:
        lines.append(_format_distribution("Throughput", tps, unit=" tok/s"))
        lines.append("")

    if all(
        stats.get(key) is None
        for key in [
            "time_to_first_tool_call",
            "total_task_duration",
            "tool_call_latency",
            "tokens_per_second",
        ]
    ):
        lines.append("  No latency data available.")
        lines.append("")

    lines.append("=" * 50)
    return "\n".join(lines)