mcpbr-0.4.14-py3-none-any.whl → mcpbr-0.4.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcpbr/latency_metrics.py ADDED
@@ -0,0 +1,317 @@
+ """Latency and performance benchmarking metrics for evaluation runs.
+
+ This module complements the PerformanceProfiler in profiler.py by providing
+ aggregate latency statistics across multiple evaluation tasks. While the profiler
+ tracks per-task performance, this module computes cross-task percentile distributions
+ and throughput metrics suitable for benchmarking reports.
+
+ Key capabilities:
+ - Per-task event timestamp tracking (start, first tool call, first response, end)
+ - Per-tool-call latency recording within each task
+ - Aggregate percentile statistics (p50, p95, p99, mean) across tasks
+ - Tokens-per-second throughput calculation
+ - Human-readable latency report formatting
+ """
+
+ import statistics
+ from dataclasses import dataclass, field
+ from typing import Any
+
+
+ def percentile(data: list[float], p: float) -> float:
+     """Calculate the p-th percentile of a list of values.
+
+     Uses linear interpolation between closest ranks for accurate percentile
+     estimation, falling back to boundary values for edge cases.
+
+     Args:
+         data: List of numeric values. Must not be empty.
+         p: Percentile to compute, in range [0, 100].
+
+     Returns:
+         The interpolated percentile value.
+
+     Raises:
+         ValueError: If data is empty or p is outside [0, 100].
+     """
+     if not data:
+         raise ValueError("Cannot compute percentile of empty data")
+     if p < 0 or p > 100:
+         raise ValueError(f"Percentile must be between 0 and 100, got {p}")
+
+     sorted_data = sorted(data)
+     n = len(sorted_data)
+
+     if n == 1:
+         return sorted_data[0]
+
+     # Compute the rank using the C = 1 interpolation method (same as Excel PERCENTILE.INC)
+     rank = (p / 100) * (n - 1)
+     lower_index = int(rank)
+     upper_index = lower_index + 1
+     fraction = rank - lower_index
+
+     if upper_index >= n:
+         return sorted_data[-1]
+
+     return sorted_data[lower_index] + fraction * (
+         sorted_data[upper_index] - sorted_data[lower_index]
+     )
+
+
+ @dataclass
+ class LatencyTracker:
+     """Records timestamps for key events during a single evaluation task.
+
+     Tracks the lifecycle of a task from start to end, including when the first
+     tool call and first response occur. Also records individual tool call latencies
+     for fine-grained analysis.
+
+     Attributes:
+         task_id: Identifier for the task being tracked.
+         task_start: Timestamp (seconds since epoch) when the task began.
+         first_tool_call: Timestamp when the first tool call was initiated.
+         first_response: Timestamp when the first response was received.
+         task_end: Timestamp when the task completed.
+         tool_call_latencies: List of individual tool call durations in seconds.
+         total_tokens: Total tokens (input + output) consumed during the task.
+     """
+
+     task_id: str = ""
+     task_start: float | None = None
+     first_tool_call: float | None = None
+     first_response: float | None = None
+     task_end: float | None = None
+     tool_call_latencies: list[float] = field(default_factory=list)
+     total_tokens: int = 0
+
+     def record_task_start(self, timestamp: float) -> None:
+         """Record the task start timestamp.
+
+         Args:
+             timestamp: Time in seconds (e.g., from time.time()).
+         """
+         self.task_start = timestamp
+
+     def record_first_tool_call(self, timestamp: float) -> None:
+         """Record the first tool call timestamp.
+
+         Only records the first occurrence; subsequent calls are ignored.
+
+         Args:
+             timestamp: Time in seconds.
+         """
+         if self.first_tool_call is None:
+             self.first_tool_call = timestamp
+
+     def record_first_response(self, timestamp: float) -> None:
+         """Record the first response timestamp.
+
+         Only records the first occurrence; subsequent calls are ignored.
+
+         Args:
+             timestamp: Time in seconds.
+         """
+         if self.first_response is None:
+             self.first_response = timestamp
+
+     def record_task_end(self, timestamp: float) -> None:
+         """Record the task end timestamp.
+
+         Args:
+             timestamp: Time in seconds.
+         """
+         self.task_end = timestamp
+
+     def record_tool_call_latency(self, duration_seconds: float) -> None:
+         """Record the latency of an individual tool call.
+
+         Args:
+             duration_seconds: Duration of the tool call in seconds.
+         """
+         self.tool_call_latencies.append(duration_seconds)
+
+     @property
+     def time_to_first_tool_call(self) -> float | None:
+         """Calculate time from task start to first tool call.
+
+         Returns:
+             Duration in seconds, or None if either timestamp is missing.
+         """
+         if self.task_start is not None and self.first_tool_call is not None:
+             return self.first_tool_call - self.task_start
+         return None
+
+     @property
+     def total_task_duration(self) -> float | None:
+         """Calculate total task duration from start to end.
+
+         Returns:
+             Duration in seconds, or None if either timestamp is missing.
+         """
+         if self.task_start is not None and self.task_end is not None:
+             return self.task_end - self.task_start
+         return None
+
+     @property
+     def tokens_per_second(self) -> float | None:
+         """Calculate throughput in tokens per second.
+
+         Returns:
+             Tokens per second, or None if duration is zero or unavailable.
+         """
+         duration = self.total_task_duration
+         if duration is not None and duration > 0 and self.total_tokens > 0:
+             return self.total_tokens / duration
+         return None
+
+
+ def _compute_distribution(values: list[float]) -> dict[str, float]:
+     """Compute percentile distribution and mean for a list of values.
+
+     Args:
+         values: List of numeric values. Must not be empty.
+
+     Returns:
+         Dictionary with keys: p50, p95, p99, mean.
+     """
+     return {
+         "p50": percentile(values, 50),
+         "p95": percentile(values, 95),
+         "p99": percentile(values, 99),
+         "mean": statistics.mean(values),
+     }
+
+
+ def compute_latency_stats(trackers: list["LatencyTracker"]) -> dict[str, Any]:
+     """Compute aggregate latency statistics across multiple task trackers.
+
+     Collects timing data from all trackers and produces percentile distributions
+     for key metrics: time to first tool call, total task duration, individual
+     tool call latency, and tokens-per-second throughput.
+
+     Args:
+         trackers: List of LatencyTracker instances with recorded data.
+
+     Returns:
+         Dictionary containing:
+         - time_to_first_tool_call: {p50, p95, p99, mean} or None
+         - total_task_duration: {p50, p95, p99, mean} or None
+         - tool_call_latency: {p50, p95, p99, mean} or None
+         - tokens_per_second: {p50, p95, p99, mean} or None
+         - task_count: number of trackers analyzed
+     """
+     if not trackers:
+         return {
+             "time_to_first_tool_call": None,
+             "total_task_duration": None,
+             "tool_call_latency": None,
+             "tokens_per_second": None,
+             "task_count": 0,
+         }
+
+     # Collect values from all trackers
+     ttftc_values: list[float] = []
+     duration_values: list[float] = []
+     tool_latency_values: list[float] = []
+     tps_values: list[float] = []
+
+     for tracker in trackers:
+         ttftc = tracker.time_to_first_tool_call
+         if ttftc is not None:
+             ttftc_values.append(ttftc)
+
+         duration = tracker.total_task_duration
+         if duration is not None:
+             duration_values.append(duration)
+
+         tool_latency_values.extend(tracker.tool_call_latencies)
+
+         tps = tracker.tokens_per_second
+         if tps is not None:
+             tps_values.append(tps)
+
+     return {
+         "time_to_first_tool_call": _compute_distribution(ttftc_values) if ttftc_values else None,
+         "total_task_duration": _compute_distribution(duration_values) if duration_values else None,
+         "tool_call_latency": (
+             _compute_distribution(tool_latency_values) if tool_latency_values else None
+         ),
+         "tokens_per_second": _compute_distribution(tps_values) if tps_values else None,
+         "task_count": len(trackers),
+     }
+
+
+ def _format_distribution(label: str, dist: dict[str, float], unit: str = "s") -> str:
+     """Format a single distribution as a human-readable line.
+
+     Args:
+         label: Name of the metric.
+         dist: Distribution dict with p50, p95, p99, mean.
+         unit: Unit suffix to append to values.
+
+     Returns:
+         Formatted string line.
+     """
+     return (
+         f"  {label}:\n"
+         f"    Mean: {dist['mean']:.3f}{unit}\n"
+         f"    p50: {dist['p50']:.3f}{unit}\n"
+         f"    p95: {dist['p95']:.3f}{unit}\n"
+         f"    p99: {dist['p99']:.3f}{unit}"
+     )
+
+
+ def format_latency_report(stats: dict[str, Any]) -> str:
+     """Format latency statistics into a human-readable report.
+
+     Produces a multi-line text report suitable for console output or inclusion
+     in benchmark result files.
+
+     Args:
+         stats: Statistics dictionary as returned by compute_latency_stats().
+
+     Returns:
+         Formatted multi-line report string.
+     """
+     lines: list[str] = []
+     lines.append("=" * 50)
+     lines.append("Latency & Performance Report")
+     lines.append("=" * 50)
+     lines.append(f"Tasks analyzed: {stats.get('task_count', 0)}")
+     lines.append("")
+
+     ttftc = stats.get("time_to_first_tool_call")
+     if ttftc is not None:
+         lines.append(_format_distribution("Time to First Tool Call", ttftc))
+         lines.append("")
+
+     duration = stats.get("total_task_duration")
+     if duration is not None:
+         lines.append(_format_distribution("Total Task Duration", duration))
+         lines.append("")
+
+     tool_latency = stats.get("tool_call_latency")
+     if tool_latency is not None:
+         lines.append(_format_distribution("Tool Call Latency", tool_latency))
+         lines.append("")
+
+     tps = stats.get("tokens_per_second")
+     if tps is not None:
+         lines.append(_format_distribution("Throughput", tps, unit=" tok/s"))
+         lines.append("")
+
+     if all(
+         stats.get(key) is None
+         for key in [
+             "time_to_first_tool_call",
+             "total_task_duration",
+             "tool_call_latency",
+             "tokens_per_second",
+         ]
+     ):
+         lines.append("  No latency data available.")
+         lines.append("")
+
+     lines.append("=" * 50)
+     return "\n".join(lines)
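The module above is self-contained, so a short usage sketch helps orient reviewers. This example is not part of the package diff; it assumes the file ships as `mcpbr/latency_metrics.py` (per the RECORD below), and the timings and token counts are invented for illustration:

```python
import time

from mcpbr.latency_metrics import (
    LatencyTracker,
    compute_latency_stats,
    format_latency_report,
    percentile,
)

# percentile() follows the PERCENTILE.INC convention: linear interpolation
# between closest ranks, so the median of [1, 2, 3, 4] is 2.5.
assert percentile([1.0, 2.0, 3.0, 4.0], 50) == 2.5

# Simulate two completed tasks with made-up timings.
trackers = []
for task_id, duration, tokens in [("task-1", 12.0, 2400), ("task-2", 20.0, 3000)]:
    tracker = LatencyTracker(task_id=task_id, total_tokens=tokens)
    start = time.time()
    tracker.record_task_start(start)
    tracker.record_first_tool_call(start + 1.5)  # only the first call is kept
    tracker.record_tool_call_latency(0.8)        # per-call durations, in seconds
    tracker.record_tool_call_latency(1.2)
    tracker.record_task_end(start + duration)
    trackers.append(tracker)

# Aggregate p50/p95/p99/mean across tasks, then render the text report.
stats = compute_latency_stats(trackers)
print(format_latency_report(stats))
```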
mcpbr/sampling.py ADDED
@@ -0,0 +1,193 @@
+ """Sampling strategies for benchmark task selection.
+
+ Provides random and stratified sampling with seed control for reproducible
+ benchmark evaluations. Supports sequential (default), random, and stratified
+ sampling strategies.
+ """
+
+ import random
+ from collections import defaultdict
+ from enum import Enum
+ from typing import Any
+
+
+ class SamplingStrategy(Enum):
+     """Sampling strategy for selecting benchmark tasks.
+
+     Attributes:
+         SEQUENTIAL: Take the first N tasks (default behavior, backward compatible).
+         RANDOM: Randomly sample N tasks with optional seed for reproducibility.
+         STRATIFIED: Group tasks by a field, then sample proportionally from each group.
+     """
+
+     SEQUENTIAL = "sequential"
+     RANDOM = "random"
+     STRATIFIED = "stratified"
+
+
+ def sample_tasks(
+     tasks: list[dict[str, Any]],
+     sample_size: int | None = None,
+     strategy: SamplingStrategy = SamplingStrategy.SEQUENTIAL,
+     seed: int | None = None,
+     stratify_field: str | None = None,
+ ) -> list[dict[str, Any]]:
+     """Sample tasks from a list using the specified strategy.
+
+     Args:
+         tasks: Full list of task dictionaries to sample from.
+         sample_size: Number of tasks to select. None returns all tasks.
+         strategy: Sampling strategy to use.
+         seed: Random seed for reproducibility (used by RANDOM and STRATIFIED).
+         stratify_field: Field name to group by for STRATIFIED sampling.
+             Required when strategy is STRATIFIED.
+
+     Returns:
+         List of sampled task dictionaries.
+
+     Raises:
+         ValueError: If strategy is STRATIFIED but stratify_field is not provided.
+         ValueError: If strategy is STRATIFIED but stratify_field is not found in any task.
+     """
+     if not tasks:
+         return []
+
+     if sample_size is None or sample_size >= len(tasks):
+         return list(tasks)
+
+     if sample_size <= 0:
+         return []
+
+     if strategy == SamplingStrategy.SEQUENTIAL:
+         return _sample_sequential(tasks, sample_size)
+     elif strategy == SamplingStrategy.RANDOM:
+         return _sample_random(tasks, sample_size, seed)
+     elif strategy == SamplingStrategy.STRATIFIED:
+         return _sample_stratified(tasks, sample_size, seed, stratify_field)
+     else:
+         raise ValueError(f"Unknown sampling strategy: {strategy}")
+
+
+ def _sample_sequential(
+     tasks: list[dict[str, Any]],
+     sample_size: int,
+ ) -> list[dict[str, Any]]:
+     """Take the first N tasks sequentially.
+
+     This matches the existing behavior where tasks[:sample_size] is used.
+
+     Args:
+         tasks: Full list of tasks.
+         sample_size: Number of tasks to select.
+
+     Returns:
+         First sample_size tasks from the list.
+     """
+     return tasks[:sample_size]
+
+
+ def _sample_random(
+     tasks: list[dict[str, Any]],
+     sample_size: int,
+     seed: int | None = None,
+ ) -> list[dict[str, Any]]:
+     """Randomly sample N tasks with optional seed for reproducibility.
+
+     Args:
+         tasks: Full list of tasks.
+         sample_size: Number of tasks to select.
+         seed: Random seed for reproducibility.
+
+     Returns:
+         Randomly selected tasks.
+     """
+     rng = random.Random(seed)
+     return rng.sample(tasks, sample_size)
+
+
+ def _sample_stratified(
+     tasks: list[dict[str, Any]],
+     sample_size: int,
+     seed: int | None = None,
+     stratify_field: str | None = None,
+ ) -> list[dict[str, Any]]:
+     """Sample proportionally from groups defined by stratify_field.
+
+     Groups tasks by the value of stratify_field, then samples from each group
+     proportionally to its size in the original dataset. Uses round-robin allocation
+     for any remainder to ensure exact sample_size is met.
+
+     Args:
+         tasks: Full list of tasks.
+         sample_size: Total number of tasks to select across all groups.
+         seed: Random seed for reproducibility.
+         stratify_field: Field name to group tasks by.
+
+     Returns:
+         Stratified sample of tasks.
+
+     Raises:
+         ValueError: If stratify_field is None or not found in any task.
+     """
+     if not stratify_field:
+         raise ValueError("stratify_field is required when using STRATIFIED sampling strategy")
+
+     # Group tasks by the stratify_field value
+     groups: dict[str, list[dict[str, Any]]] = defaultdict(list)
+     for task in tasks:
+         key = str(task.get(stratify_field, "_unknown_"))
+         groups[key] = groups.get(key, [])
+         groups[key].append(task)
+
+     # Check that at least one task had the stratify_field
+     if len(groups) == 1 and "_unknown_" in groups:
+         raise ValueError(
+             f"stratify_field '{stratify_field}' not found in any task. "
+             f"Available fields: {list(tasks[0].keys()) if tasks else []}"
+         )
+
+     total_tasks = len(tasks)
+     rng = random.Random(seed)
+
+     # Sort group keys for deterministic ordering
+     sorted_keys = sorted(groups.keys())
+
+     # Calculate proportional allocation for each group
+     allocations: dict[str, int] = {}
+     allocated = 0
+     for key in sorted_keys:
+         group_size = len(groups[key])
+         # Proportional allocation (floor)
+         proportion = group_size / total_tasks
+         count = int(sample_size * proportion)
+         allocations[key] = count
+         allocated += count
+
+     # Distribute remainder using round-robin over groups sorted by fractional part
+     remainder = sample_size - allocated
+     if remainder > 0:
+         # Sort groups by their fractional allocation (descending) for fair distribution
+         fractional_parts = []
+         for key in sorted_keys:
+             group_size = len(groups[key])
+             proportion = group_size / total_tasks
+             exact = sample_size * proportion
+             fractional = exact - int(exact)
+             fractional_parts.append((fractional, key))
+
+         fractional_parts.sort(key=lambda x: x[0], reverse=True)
+
+         for i in range(remainder):
+             _, key = fractional_parts[i % len(fractional_parts)]
+             allocations[key] += 1
+
+     # Sample from each group
+     result: list[dict[str, Any]] = []
+     for key in sorted_keys:
+         group = groups[key]
+         count = min(allocations[key], len(group))
+         if count > 0:
+             sampled = rng.sample(group, count)
+             result.extend(sampled)
+
+     return result
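As with the latency module, a brief sketch shows the intended call pattern. Again this is illustrative only; it assumes the file ships as `mcpbr/sampling.py` (per the RECORD below), and the task dictionaries are invented:

```python
from mcpbr.sampling import SamplingStrategy, sample_tasks

# Hypothetical task list: 6 "easy" tasks and 3 "hard" ones.
tasks = [{"id": i, "difficulty": "easy" if i < 6 else "hard"} for i in range(9)]

# Proportional allocation for sample_size=4: easy gets floor(4 * 6/9) = 2 and
# hard gets floor(4 * 3/9) = 1; the remaining slot goes to the group with the
# larger fractional part (easy, 0.67 vs. 0.33), so easy=3 and hard=1.
sampled = sample_tasks(
    tasks,
    sample_size=4,
    strategy=SamplingStrategy.STRATIFIED,
    seed=42,
    stratify_field="difficulty",
)
assert len(sampled) == 4

# The same seed reproduces the same sample.
resampled = sample_tasks(
    tasks,
    sample_size=4,
    strategy=SamplingStrategy.STRATIFIED,
    seed=42,
    stratify_field="difficulty",
)
assert sampled == resampled
```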
mcpbr-0.4.14.dist-info/METADATA → mcpbr-0.4.16.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mcpbr
- Version: 0.4.14
+ Version: 0.4.16
  Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
  Project-URL: Homepage, https://github.com/greynewell/mcpbr
  Project-URL: Repository, https://github.com/greynewell/mcpbr
@@ -100,7 +100,7 @@ mcpbr runs controlled experiments: same model, same tasks, same environment - th

  ## Supported Benchmarks

- mcpbr supports 25+ benchmarks across 8 categories through a flexible abstraction layer:
+ mcpbr supports 30+ benchmarks across 10 categories through a flexible abstraction layer:

  | Category | Benchmarks |
  |----------|-----------|
@@ -111,7 +111,11 @@ mcpbr supports 25+ benchmarks across 8 categories through a flexible abstraction
  | **Tool Use & Agents** | [MCPToolBench++](https://greynewell.github.io/mcpbr/benchmarks/mcptoolbench/), [ToolBench](https://greynewell.github.io/mcpbr/benchmarks/toolbench/), [AgentBench](https://greynewell.github.io/mcpbr/benchmarks/agentbench/), [WebArena](https://greynewell.github.io/mcpbr/benchmarks/webarena/), [TerminalBench](https://greynewell.github.io/mcpbr/benchmarks/terminalbench/), [InterCode](https://greynewell.github.io/mcpbr/benchmarks/intercode/) |
  | **ML Research** | [MLAgentBench](https://greynewell.github.io/mcpbr/benchmarks/mlagentbench/) |
  | **Code Understanding** | [RepoQA](https://greynewell.github.io/mcpbr/benchmarks/repoqa/) |
+ | **Multimodal** | MMMU |
+ | **Long Context** | LongBench |
+ | **Safety & Adversarial** | Adversarial (HarmBench) |
  | **Security** | [CyberGym](https://greynewell.github.io/mcpbr/benchmarks/cybergym/) |
+ | **Custom** | User-defined benchmarks via YAML |

  ### Featured Benchmarks

@@ -1470,10 +1474,10 @@ We're building the defacto standard for MCP server benchmarking! Our [v1.0 Roadm
  - Cost analysis in reports

  **Phase 2: Benchmarks** (v0.4.0)
- - HumanEval, MBPP, ToolBench
- - GAIA for general AI capabilities
- - Custom benchmark YAML support
- - SWE-bench Verified
+ - 30+ benchmarks across 10 categories
+ - Custom benchmark YAML support
+ - Custom metrics, failure analysis, sampling strategies
+ - ✅ Dataset versioning, latency metrics, GPU support, few-shot learning

  **Phase 3: Developer Experience** (v0.5.0)
  - Real-time dashboard
mcpbr-0.4.14.dist-info/RECORD → mcpbr-0.4.16.dist-info/RECORD
@@ -3,16 +3,22 @@ mcpbr/__main__.py,sha256=WmeQsAqtW_9tMTNKArH1m76DPBokZpXuy6dMZp13gXA,132
  mcpbr/agent.py,sha256=aSFH2S3ExKZfdVfMbzk6D1nRhpKt4JmpRzmF4Vi6Gmo,5795
  mcpbr/cache.py,sha256=YiP13omwMbXLb6NhNocJvL58enXEx9J8OrvTZnWUkw4,13254
  mcpbr/cli.py,sha256=xvh7gpJx0LzjV3g-Te4FF7BfHubGzDxOiYQsSeQnCEc,68276
- mcpbr/config.py,sha256=E9Icedjk_VFONnnEZbWW5WN7El5RaJD5pGi-JQlrlV0,18890
+ mcpbr/config.py,sha256=7lWV0ZtzyD6WZ07IR4yhT9lyBBPONzlanaO4XHm9OoE,18952
  mcpbr/config_inheritance.py,sha256=0EV9Tv62UFNgZoc8mY7yYjHEbnMM_R5EAhSeuK7ajAA,6617
  mcpbr/config_validator.py,sha256=ZMEIeK4y6fSwyY46Xv5dK5v3jM4HDKcYkosnIcn7iyI,20488
- mcpbr/docker_env.py,sha256=vpbjL227L9qLjrS7CzXevxzo9393qmOrrxWG7lP1s44,31629
+ mcpbr/custom_metrics.py,sha256=4pMO9-BPpeQ_GUTnZ18TQXINFScAMH3cIYm0HG-C51o,13213
+ mcpbr/dataset_versioning.py,sha256=Y_ZSGhl8ihl6Kgee_p7VbkNwGhgwIdMZPlRunvk4knY,7149
+ mcpbr/docker_env.py,sha256=_45OUZKjUevE9O3YLF_1uvQtdOyJ7yZIYWmSvXN3cFw,31794
  mcpbr/env_expansion.py,sha256=Rkhth-tWV8CptQlSSk9exuMsUaSTTW9hj69z4snZd_U,6122
  mcpbr/evaluation.py,sha256=EjPREWv7hBRqhBhNan0ERh2imqMBegT0Y2cgZlTxRGk,12765
- mcpbr/harness.py,sha256=sEMP2PnrQP_BKK-4yixz05qXcY-0OsJNJ5e5JU2Rtsc,51079
+ mcpbr/failure_analysis.py,sha256=N5xp9YPe2d7P9fTa2LVSHsPgB1WOQtWMeClq3bOv4_c,19883
+ mcpbr/few_shot.py,sha256=bFDdes_kgZAFWoFZQEfZG5Z2Es9rmkB1jsxSMp4aCCM,11684
+ mcpbr/gpu_support.py,sha256=eroBiLkt1A3Q2ODJDSyqrd_BzcMh8tFkjtPn7PsvJJc,5070
+ mcpbr/harness.py,sha256=8-qmcPR2CDFuoBib9g6lPx7aMOK-5PuZgpWhpGs-Ils,51419
  mcpbr/harnesses.py,sha256=h9iDp4qkPABNwO9OXbJ61qcD4n0oAUTU7AQksxRKLcg,47335
  mcpbr/incremental_save.py,sha256=1dm3pGiEIhP8cVk_Y6XF_cAdo3B_vyRc6CO8Wt-MyIA,4830
  mcpbr/junit_reporter.py,sha256=M_02zJbFbA3VoIYG5oR7VDecqWHEpIee-JOUShWNuLU,9261
+ mcpbr/latency_metrics.py,sha256=xNMaUzGMSbOIfuoyZGyIfyMk5uAmoj6K65ZAs5D6Z8c,10476
  mcpbr/log_formatter.py,sha256=d2jWH7z4IRSbr8-PbnEt3TmLAqk8vgdPT38uTnTCN5c,21488
  mcpbr/models.py,sha256=zsrBrwFeOfNKgThUbT1oPkF5pdRjL1QJjMte0vXjcbk,3710
  mcpbr/output_validator.py,sha256=TUoBtDjjXvR6MACbWV6uNOsxM_n4C0Jbn5in35HH4K8,1750
@@ -22,6 +28,7 @@ mcpbr/profiler.py,sha256=SRXLKf2TOlpnMbQpGvjRy1Agv-XaEz6lDmBa5WGNv8c,15954
  mcpbr/providers.py,sha256=ebrnH6RXODxX4Ma9r7Is5VBHYFNP5LwCs-vpLbbHP8o,6598
  mcpbr/regression.py,sha256=xm_ago8ZP3RAOrDNjtINwyRUvzKWJcJDWbzf3hp6LlU,12827
  mcpbr/reporting.py,sha256=Odzb7EgpimW-qh01VQedhb2X594ACrOcGe4jshgiwTg,56111
+ mcpbr/sampling.py,sha256=Hpgh2TayI3QGcno-Np9eYi8sklxKEZQXyhpaQlc9T4Q,6248
  mcpbr/schema.py,sha256=fdjiKmp1au2oN5aXcPRoCbyvwm2XeMD5DmeWSurMk4A,6858
  mcpbr/smoke_test.py,sha256=srYGOn_auspRbt_a6ebYDDDq_nujA_iZGman5nU1ikU,14925
  mcpbr/state_tracker.py,sha256=rIP9LIHtQg6oBsLIxnwRjE865Kw6U7DMO_GzzuMRC0E,10790
@@ -29,7 +36,8 @@ mcpbr/statistics.py,sha256=Ny8TMdBrIpS4KfKCJcuFfTeaGuTmEkS1G_uHBlboYdA,19134
  mcpbr/streaming.py,sha256=XPhkXO1R1EsWtkoPvCpyy4TehEom7hkuOeP-00joX3o,13853
  mcpbr/swebench_test_specs.py,sha256=Mh_BPjcexkgDT3p4zT2p31925b8w5tgsxxRpYZQZalM,1390
  mcpbr/templates.py,sha256=dqwboVB-yfE06w2rgDOvuWJB4Hx5duH_W-jvLBqmlKg,10683
- mcpbr/benchmarks/__init__.py,sha256=RK0TxNTSqhUX_WtGs0CcV1MX2uiCBTUWkEHYpo_7T5M,4099
+ mcpbr/benchmarks/__init__.py,sha256=2-7Ebg6-wHo1QGfVKWjjbREcLG_A-6Q0XfZGiyXrOeE,4489
+ mcpbr/benchmarks/adversarial.py,sha256=69VBTZv6BhR1JwjQepA_YwAu3b--vJviGd6IWs2h1QA,12357
  mcpbr/benchmarks/agentbench.py,sha256=jQ8OG_5cn-PvOZizXivysLTw9xvtA8c_MWfw3jXq0TQ,6512
  mcpbr/benchmarks/aider_polyglot.py,sha256=_uWYNVaW0YWEWuuSXNxsqSngvWjo0HUeubcj16Q25uk,7256
  mcpbr/benchmarks/apps.py,sha256=mvN26KNICxGZh0sxCmxR0Ph6hfXnqRsVO-oB5I6MjgQ,7801
@@ -39,6 +47,7 @@ mcpbr/benchmarks/bigbench_hard.py,sha256=jwG5YV97xo6FiNnpAUseJVO_a_6QkpCYZ1r1mGi
  mcpbr/benchmarks/bigcodebench.py,sha256=dK4QkRTM6D1v3pprBgAxSTsOz7mJqi9f4sOfMKJUJXM,7117
  mcpbr/benchmarks/codecontests.py,sha256=Kx_izYR9D1sMcfVtslCN0upGsPtbXir7UHjL1fEZzc0,8905
  mcpbr/benchmarks/codereval.py,sha256=n77q2mXgMNg7wdeoMOSNKbLh86IrwG8iIzd64Gb0NEc,8341
+ mcpbr/benchmarks/custom.py,sha256=cjuhZLSyS4oCZun-3JJo3fsSVs-lcRv5kzaoQ_m2MTU,20675
  mcpbr/benchmarks/cybergym.py,sha256=r5itZNGdiDtztlC_BGLCdtLBZu0jgAyyG2_8cNUCoJ8,18574
  mcpbr/benchmarks/gaia.py,sha256=4Lxe6YAbKyIiPYgszvRcoia74TLZ6FqoIY5_337Vjtw,6852
  mcpbr/benchmarks/gsm8k.py,sha256=CK9C6qQi3rO81nuGcE-od2-PvQ48lmL-nQcLIeZDrbM,12730
@@ -46,10 +55,12 @@ mcpbr/benchmarks/hellaswag.py,sha256=Ah8Pub7QI94lgGHnbC6g3US4NTkt-zWSReS4h9Y6XGU
  mcpbr/benchmarks/humaneval.py,sha256=J9hCB17ppey81p4HS2ynGFsDDGLOdJhw63OSaG7vhT8,18296
  mcpbr/benchmarks/intercode.py,sha256=iq0X75aL469xIR8mVGUNaPlgdqAlySPsa2YWoSftw5M,8737
  mcpbr/benchmarks/leetcode.py,sha256=lan8A5D5Bfe5B6t_wx4KzZsAr9iNF7vch0Em2g9bX-k,7772
+ mcpbr/benchmarks/longbench.py,sha256=Hb4lGiojG3apRajgsI7c0DkcP1WzqdMrdpPEkI-WAkE,20791
  mcpbr/benchmarks/math_benchmark.py,sha256=LP_gjp3Cgzt1kDWVPqufRHg0YE0N9ouThOI6avpYxCk,8322
  mcpbr/benchmarks/mbpp.py,sha256=e1tgQJOEeAQAlkeYMBr4jymTYvC9s_Nt34TKExFVFy4,6907
  mcpbr/benchmarks/mcptoolbench.py,sha256=ioXPdXeXQEgBCHccOq7ier_-ucfQI41hUu0Z4HSIIAg,16209
  mcpbr/benchmarks/mlagentbench.py,sha256=Qr_BRhQFgK66KcEAr0svP44a-twWkXeTQVPQHdX7HpM,8367
+ mcpbr/benchmarks/mmmu.py,sha256=jvIgpM-ofJAkmuDKA0jMktDBsX41s0zyC8PRG5qSBlw,11929
  mcpbr/benchmarks/repoqa.py,sha256=0Z9WxXl2dFgSWLNRGFNGd2kOU_rItNrtSdF8ZbC2TqI,6509
  mcpbr/benchmarks/swebench.py,sha256=Eo4dL1BLabQqZvSLR9xqoDmEdy0Y0mLTgincbV78DjQ,6473
  mcpbr/benchmarks/terminalbench.py,sha256=I9YLeZh5j_AYvUJFhZkhlDTfIWU3OvcuJLjzYlfAZuw,7166
@@ -69,15 +80,15 @@ mcpbr/infrastructure/azure_health.py,sha256=xITmIa9IfYIwxcVhY0sJ81a-6WNKiT8kSQTd
  mcpbr/infrastructure/base.py,sha256=Olj6uiNBeGoUqltZI1NHZfa26kzT-6jfp8YIXSykFKM,3037
  mcpbr/infrastructure/local.py,sha256=VK6UAg7Dzvb9v1LAJgNGA_s0blQKrHAQEXBAC75zAL8,4237
  mcpbr/infrastructure/manager.py,sha256=j0T7U1Tbajmfve4SNfhYKikvL9kgSVT01fYKMC-sH-s,4796
- mcpbr-0.4.14.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
- mcpbr-0.4.14.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
- mcpbr-0.4.14.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
- mcpbr-0.4.14.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
- mcpbr-0.4.14.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
- mcpbr-0.4.14.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
- mcpbr-0.4.14.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
- mcpbr-0.4.14.dist-info/METADATA,sha256=f2PEinjR_XbBOmFtDAZxoDHdBLwKxLX4V9kjYqh_UtA,54809
- mcpbr-0.4.14.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- mcpbr-0.4.14.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
- mcpbr-0.4.14.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
- mcpbr-0.4.14.dist-info/RECORD,,
+ mcpbr-0.4.16.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
+ mcpbr-0.4.16.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
+ mcpbr-0.4.16.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
+ mcpbr-0.4.16.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
+ mcpbr-0.4.16.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
+ mcpbr-0.4.16.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
+ mcpbr-0.4.16.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
+ mcpbr-0.4.16.dist-info/METADATA,sha256=GeSnMZw0x7-XPhblIu50aCO7NXaNfjgVScnBOp6ZaOA,55069
+ mcpbr-0.4.16.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ mcpbr-0.4.16.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
+ mcpbr-0.4.16.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
+ mcpbr-0.4.16.dist-info/RECORD,,