mcpbr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. mcpbr/benchmarks/__init__.py +12 -0
  2. mcpbr/benchmarks/adversarial.py +341 -0
  3. mcpbr/benchmarks/custom.py +607 -0
  4. mcpbr/benchmarks/longbench.py +623 -0
  5. mcpbr/benchmarks/mmmu.py +353 -0
  6. mcpbr/config.py +4 -0
  7. mcpbr/config_migration.py +470 -0
  8. mcpbr/config_wizard.py +647 -0
  9. mcpbr/custom_metrics.py +405 -0
  10. mcpbr/dashboard.py +619 -0
  11. mcpbr/dataset_streaming.py +491 -0
  12. mcpbr/dataset_versioning.py +222 -0
  13. mcpbr/docker_cache.py +539 -0
  14. mcpbr/docker_prewarm.py +369 -0
  15. mcpbr/dry_run.py +532 -0
  16. mcpbr/failure_analysis.py +558 -0
  17. mcpbr/few_shot.py +367 -0
  18. mcpbr/formatting.py +444 -0
  19. mcpbr/gpu_support.py +157 -0
  20. mcpbr/harness.py +38 -4
  21. mcpbr/latency_metrics.py +317 -0
  22. mcpbr/resource_limits.py +487 -0
  23. mcpbr/result_streaming.py +519 -0
  24. mcpbr/sampling.py +193 -0
  25. mcpbr/task_batching.py +403 -0
  26. mcpbr/task_scheduler.py +468 -0
  27. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/METADATA +10 -6
  28. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/RECORD +38 -15
  29. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  30. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  31. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  32. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  33. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  34. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  35. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  36. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/WHEEL +0 -0
  37. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/entry_points.txt +0 -0
  38. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/latency_metrics.py
@@ -0,0 +1,317 @@
+ """Latency and performance benchmarking metrics for evaluation runs.
+
+ This module complements the PerformanceProfiler in profiler.py by providing
+ aggregate latency statistics across multiple evaluation tasks. While the profiler
+ tracks per-task performance, this module computes cross-task percentile distributions
+ and throughput metrics suitable for benchmarking reports.
+
+ Key capabilities:
+ - Per-task event timestamp tracking (start, first tool call, first response, end)
+ - Per-tool-call latency recording within each task
+ - Aggregate percentile statistics (p50, p95, p99, mean) across tasks
+ - Tokens-per-second throughput calculation
+ - Human-readable latency report formatting
+ """
+
+ import statistics
+ from dataclasses import dataclass, field
+ from typing import Any
+
+
+ def percentile(data: list[float], p: float) -> float:
+     """Calculate the p-th percentile of a list of values.
+
+     Uses linear interpolation between closest ranks for accurate percentile
+     estimation, falling back to boundary values for edge cases.
+
+     Args:
+         data: List of numeric values. Must not be empty.
+         p: Percentile to compute, in range [0, 100].
+
+     Returns:
+         The interpolated percentile value.
+
+     Raises:
+         ValueError: If data is empty or p is outside [0, 100].
+     """
+     if not data:
+         raise ValueError("Cannot compute percentile of empty data")
+     if p < 0 or p > 100:
+         raise ValueError(f"Percentile must be between 0 and 100, got {p}")
+
+     sorted_data = sorted(data)
+     n = len(sorted_data)
+
+     if n == 1:
+         return sorted_data[0]
+
+     # Compute the rank using the C = 1 interpolation method (same as Excel PERCENTILE.INC)
+     rank = (p / 100) * (n - 1)
+     lower_index = int(rank)
+     upper_index = lower_index + 1
+     fraction = rank - lower_index
+
+     if upper_index >= n:
+         return sorted_data[-1]
+
+     return sorted_data[lower_index] + fraction * (
+         sorted_data[upper_index] - sorted_data[lower_index]
+     )
+
+
+ @dataclass
+ class LatencyTracker:
+     """Records timestamps for key events during a single evaluation task.
+
+     Tracks the lifecycle of a task from start to end, including when the first
+     tool call and first response occur. Also records individual tool call latencies
+     for fine-grained analysis.
+
+     Attributes:
+         task_id: Identifier for the task being tracked.
+         task_start: Timestamp (seconds since epoch) when the task began.
+         first_tool_call: Timestamp when the first tool call was initiated.
+         first_response: Timestamp when the first response was received.
+         task_end: Timestamp when the task completed.
+         tool_call_latencies: List of individual tool call durations in seconds.
+         total_tokens: Total tokens (input + output) consumed during the task.
+     """
+
+     task_id: str = ""
+     task_start: float | None = None
+     first_tool_call: float | None = None
+     first_response: float | None = None
+     task_end: float | None = None
+     tool_call_latencies: list[float] = field(default_factory=list)
+     total_tokens: int = 0
+
+     def record_task_start(self, timestamp: float) -> None:
+         """Record the task start timestamp.
+
+         Args:
+             timestamp: Time in seconds (e.g., from time.time()).
+         """
+         self.task_start = timestamp
+
+     def record_first_tool_call(self, timestamp: float) -> None:
+         """Record the first tool call timestamp.
+
+         Only records the first occurrence; subsequent calls are ignored.
+
+         Args:
+             timestamp: Time in seconds.
+         """
+         if self.first_tool_call is None:
+             self.first_tool_call = timestamp
+
+     def record_first_response(self, timestamp: float) -> None:
+         """Record the first response timestamp.
+
+         Only records the first occurrence; subsequent calls are ignored.
+
+         Args:
+             timestamp: Time in seconds.
+         """
+         if self.first_response is None:
+             self.first_response = timestamp
+
+     def record_task_end(self, timestamp: float) -> None:
+         """Record the task end timestamp.
+
+         Args:
+             timestamp: Time in seconds.
+         """
+         self.task_end = timestamp
+
+     def record_tool_call_latency(self, duration_seconds: float) -> None:
+         """Record the latency of an individual tool call.
+
+         Args:
+             duration_seconds: Duration of the tool call in seconds.
+         """
+         self.tool_call_latencies.append(duration_seconds)
+
+     @property
+     def time_to_first_tool_call(self) -> float | None:
+         """Calculate time from task start to first tool call.
+
+         Returns:
+             Duration in seconds, or None if either timestamp is missing.
+         """
+         if self.task_start is not None and self.first_tool_call is not None:
+             return self.first_tool_call - self.task_start
+         return None
+
+     @property
+     def total_task_duration(self) -> float | None:
+         """Calculate total task duration from start to end.
+
+         Returns:
+             Duration in seconds, or None if either timestamp is missing.
+         """
+         if self.task_start is not None and self.task_end is not None:
+             return self.task_end - self.task_start
+         return None
+
+     @property
+     def tokens_per_second(self) -> float | None:
+         """Calculate throughput in tokens per second.
+
+         Returns:
+             Tokens per second, or None if duration is zero or unavailable.
+         """
+         duration = self.total_task_duration
+         if duration is not None and duration > 0 and self.total_tokens > 0:
+             return self.total_tokens / duration
+         return None
+
+
+ def _compute_distribution(values: list[float]) -> dict[str, float]:
+     """Compute percentile distribution and mean for a list of values.
+
+     Args:
+         values: List of numeric values. Must not be empty.
+
+     Returns:
+         Dictionary with keys: p50, p95, p99, mean.
+     """
+     return {
+         "p50": percentile(values, 50),
+         "p95": percentile(values, 95),
+         "p99": percentile(values, 99),
+         "mean": statistics.mean(values),
+     }
+
+
+ def compute_latency_stats(trackers: list["LatencyTracker"]) -> dict[str, Any]:
+     """Compute aggregate latency statistics across multiple task trackers.
+
+     Collects timing data from all trackers and produces percentile distributions
+     for key metrics: time to first tool call, total task duration, individual
+     tool call latency, and tokens-per-second throughput.
+
+     Args:
+         trackers: List of LatencyTracker instances with recorded data.
+
+     Returns:
+         Dictionary containing:
+         - time_to_first_tool_call: {p50, p95, p99, mean} or None
+         - total_task_duration: {p50, p95, p99, mean} or None
+         - tool_call_latency: {p50, p95, p99, mean} or None
+         - tokens_per_second: {p50, p95, p99, mean} or None
+         - task_count: number of trackers analyzed
+     """
+     if not trackers:
+         return {
+             "time_to_first_tool_call": None,
+             "total_task_duration": None,
+             "tool_call_latency": None,
+             "tokens_per_second": None,
+             "task_count": 0,
+         }
+
+     # Collect values from all trackers
+     ttftc_values: list[float] = []
+     duration_values: list[float] = []
+     tool_latency_values: list[float] = []
+     tps_values: list[float] = []
+
+     for tracker in trackers:
+         ttftc = tracker.time_to_first_tool_call
+         if ttftc is not None:
+             ttftc_values.append(ttftc)
+
+         duration = tracker.total_task_duration
+         if duration is not None:
+             duration_values.append(duration)
+
+         tool_latency_values.extend(tracker.tool_call_latencies)
+
+         tps = tracker.tokens_per_second
+         if tps is not None:
+             tps_values.append(tps)
+
+     return {
+         "time_to_first_tool_call": _compute_distribution(ttftc_values) if ttftc_values else None,
+         "total_task_duration": _compute_distribution(duration_values) if duration_values else None,
+         "tool_call_latency": (
+             _compute_distribution(tool_latency_values) if tool_latency_values else None
+         ),
+         "tokens_per_second": _compute_distribution(tps_values) if tps_values else None,
+         "task_count": len(trackers),
+     }
+
+
+ def _format_distribution(label: str, dist: dict[str, float], unit: str = "s") -> str:
+     """Format a single distribution as a human-readable line.
+
+     Args:
+         label: Name of the metric.
+         dist: Distribution dict with p50, p95, p99, mean.
+         unit: Unit suffix to append to values.
+
+     Returns:
+         Formatted string line.
+     """
+     return (
+         f" {label}:\n"
+         f" Mean: {dist['mean']:.3f}{unit}\n"
+         f" p50: {dist['p50']:.3f}{unit}\n"
+         f" p95: {dist['p95']:.3f}{unit}\n"
+         f" p99: {dist['p99']:.3f}{unit}"
+     )
+
+
+ def format_latency_report(stats: dict[str, Any]) -> str:
+     """Format latency statistics into a human-readable report.
+
+     Produces a multi-line text report suitable for console output or inclusion
+     in benchmark result files.
+
+     Args:
+         stats: Statistics dictionary as returned by compute_latency_stats().
+
+     Returns:
+         Formatted multi-line report string.
+     """
+     lines: list[str] = []
+     lines.append("=" * 50)
+     lines.append("Latency & Performance Report")
+     lines.append("=" * 50)
+     lines.append(f"Tasks analyzed: {stats.get('task_count', 0)}")
+     lines.append("")
+
+     ttftc = stats.get("time_to_first_tool_call")
+     if ttftc is not None:
+         lines.append(_format_distribution("Time to First Tool Call", ttftc))
+         lines.append("")
+
+     duration = stats.get("total_task_duration")
+     if duration is not None:
+         lines.append(_format_distribution("Total Task Duration", duration))
+         lines.append("")
+
+     tool_latency = stats.get("tool_call_latency")
+     if tool_latency is not None:
+         lines.append(_format_distribution("Tool Call Latency", tool_latency))
+         lines.append("")
+
+     tps = stats.get("tokens_per_second")
+     if tps is not None:
+         lines.append(_format_distribution("Throughput", tps, unit=" tok/s"))
+         lines.append("")
+
+     if all(
+         stats.get(key) is None
+         for key in [
+             "time_to_first_tool_call",
+             "total_task_duration",
+             "tool_call_latency",
+             "tokens_per_second",
+         ]
+     ):
+         lines.append(" No latency data available.")
+         lines.append("")
+
+     lines.append("=" * 50)
+     return "\n".join(lines)
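A minimal usage sketch (not part of the diff) showing how the new module fits together, assuming mcpbr 0.5.0 is installed and the file is importable as mcpbr.latency_metrics; the class and function names are taken from the hunk above, while the task id, token count, and timing offsets are illustrative only.

import time
from mcpbr.latency_metrics import (
    LatencyTracker,
    compute_latency_stats,
    format_latency_report,
    percentile,
)

# Track a single task: record lifecycle timestamps and per-tool-call latencies.
start = time.time()
tracker = LatencyTracker(task_id="task-001")          # illustrative id
tracker.record_task_start(start)
tracker.record_first_tool_call(start + 0.5)           # illustrative offsets
tracker.record_tool_call_latency(0.25)
tracker.record_first_response(start + 1.0)
tracker.record_task_end(start + 2.0)
tracker.total_tokens = 1200                           # illustrative token count

# Aggregate across one or more trackers, then render the text report.
stats = compute_latency_stats([tracker])
print(format_latency_report(stats))

# percentile() uses linear interpolation between closest ranks
# (PERCENTILE.INC-style), e.g. percentile([1.0, 2.0, 3.0, 4.0], 50) returns 2.5.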