mcpbr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcpbr/benchmarks/__init__.py +12 -0
- mcpbr/benchmarks/adversarial.py +341 -0
- mcpbr/benchmarks/custom.py +607 -0
- mcpbr/benchmarks/longbench.py +623 -0
- mcpbr/benchmarks/mmmu.py +353 -0
- mcpbr/config.py +4 -0
- mcpbr/config_migration.py +470 -0
- mcpbr/config_wizard.py +647 -0
- mcpbr/custom_metrics.py +405 -0
- mcpbr/dashboard.py +619 -0
- mcpbr/dataset_streaming.py +491 -0
- mcpbr/dataset_versioning.py +222 -0
- mcpbr/docker_cache.py +539 -0
- mcpbr/docker_prewarm.py +369 -0
- mcpbr/dry_run.py +532 -0
- mcpbr/failure_analysis.py +558 -0
- mcpbr/few_shot.py +367 -0
- mcpbr/formatting.py +444 -0
- mcpbr/gpu_support.py +157 -0
- mcpbr/harness.py +38 -4
- mcpbr/latency_metrics.py +317 -0
- mcpbr/resource_limits.py +487 -0
- mcpbr/result_streaming.py +519 -0
- mcpbr/sampling.py +193 -0
- mcpbr/task_batching.py +403 -0
- mcpbr/task_scheduler.py +468 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/METADATA +10 -6
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/RECORD +38 -15
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/WHEEL +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/entry_points.txt +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/few_shot.py
ADDED
@@ -0,0 +1,367 @@
"""Few-shot learning support for benchmark evaluations.

Provides configurable few-shot example selection with multiple strategies
(random, similar, diverse), prompt formatting, and learning curve analysis
to study how performance changes with varying shot counts.
"""

from __future__ import annotations

import random
from dataclasses import dataclass
from typing import Any


@dataclass
class FewShotConfig:
    """Configuration for few-shot example selection.

    Attributes:
        num_shots: Number of examples to include (0 for zero-shot).
        selection_strategy: Strategy for selecting examples. One of
            ``"random"``, ``"similar"``, or ``"diverse"``.
        seed: Random seed for reproducibility. ``None`` for non-deterministic.
    """

    num_shots: int = 0
    selection_strategy: str = "random"
    seed: int | None = None

    def __post_init__(self) -> None:
        """Validate configuration values after initialisation."""
        if self.num_shots < 0:
            raise ValueError(f"num_shots must be non-negative, got {self.num_shots}")

        valid_strategies = {"random", "similar", "diverse"}
        if self.selection_strategy not in valid_strategies:
            raise ValueError(
                f"selection_strategy must be one of {valid_strategies}, "
                f"got {self.selection_strategy!r}"
            )


def select_examples(
    pool: list[dict[str, Any]],
    query: dict[str, Any],
    config: FewShotConfig,
) -> list[dict[str, Any]]:
    """Select few-shot examples from a pool based on the given configuration.

    Args:
        pool: List of candidate example dictionaries to select from.
        query: The query/task dictionary that examples are being selected for.
            Used by the ``"similar"`` strategy to find related examples.
        config: Few-shot configuration controlling selection behaviour.

    Returns:
        List of selected example dictionaries. Length is
        ``min(config.num_shots, len(pool))``.
    """
    if config.num_shots == 0 or not pool:
        return []

    num = min(config.num_shots, len(pool))

    if config.selection_strategy == "random":
        return _select_random(pool, num, config.seed)
    elif config.selection_strategy == "similar":
        return _select_similar(pool, query, num, config.seed)
    elif config.selection_strategy == "diverse":
        return _select_diverse(pool, num, config.seed)
    else:
        raise ValueError(f"Unknown selection strategy: {config.selection_strategy!r}")


def format_few_shot_prompt(
    examples: list[dict[str, Any]],
    query: dict[str, Any],
    template: str,
) -> str:
    """Format a prompt string with few-shot examples and the target query.

    The *template* string should contain ``{examples}`` and ``{query}``
    placeholders. Each example is formatted as a numbered block using the
    example's string representation.

    Args:
        examples: List of few-shot example dictionaries.
        query: The target query dictionary.
        template: A format string containing ``{examples}`` and ``{query}``
            placeholders.

    Returns:
        The fully formatted prompt string.
    """
    if not examples:
        examples_text = "No examples provided."
    else:
        parts: list[str] = []
        for i, example in enumerate(examples, 1):
            parts.append(f"Example {i}:\n{_format_dict(example)}")
        examples_text = "\n\n".join(parts)

    query_text = _format_dict(query)

    return template.format(examples=examples_text, query=query_text)


def compute_learning_curve(
    results_by_shots: dict[int, list[dict[str, Any]]],
) -> dict[str, Any]:
    """Analyse how performance changes with the number of few-shot examples.

    For each shot count, computes accuracy (fraction of results where
    ``"resolved"`` is truthy), average cost, and average token usage.

    Args:
        results_by_shots: Mapping from shot count (int) to a list of result
            dictionaries. Each result dict may contain keys such as
            ``"resolved"`` (bool), ``"cost"`` (float), and
            ``"tokens"`` (dict with ``"input"`` and ``"output"``).

    Returns:
        Dictionary with keys:

        - ``"shot_counts"``: sorted list of shot counts.
        - ``"accuracy"``: list of accuracy values corresponding to each shot count.
        - ``"avg_cost"``: list of average cost values.
        - ``"avg_tokens"``: list of average total token counts.
        - ``"num_samples"``: list of sample counts for each shot count.
    """
    if not results_by_shots:
        return {
            "shot_counts": [],
            "accuracy": [],
            "avg_cost": [],
            "avg_tokens": [],
            "num_samples": [],
        }

    sorted_shots = sorted(results_by_shots.keys())
    accuracies: list[float] = []
    avg_costs: list[float] = []
    avg_tokens: list[float] = []
    num_samples: list[int] = []

    for shot_count in sorted_shots:
        results = results_by_shots[shot_count]
        n = len(results)
        num_samples.append(n)

        if n == 0:
            accuracies.append(0.0)
            avg_costs.append(0.0)
            avg_tokens.append(0.0)
            continue

        # Accuracy
        resolved = sum(1 for r in results if r.get("resolved"))
        accuracies.append(resolved / n)

        # Average cost
        total_cost = sum(r.get("cost", 0.0) for r in results)
        avg_costs.append(total_cost / n)

        # Average tokens
        total_tokens = 0
        for r in results:
            tokens = r.get("tokens", {})
            total_tokens += tokens.get("input", 0) + tokens.get("output", 0)
        avg_tokens.append(total_tokens / n)

    return {
        "shot_counts": sorted_shots,
        "accuracy": accuracies,
        "avg_cost": avg_costs,
        "avg_tokens": avg_tokens,
        "num_samples": num_samples,
    }


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _format_dict(d: dict[str, Any]) -> str:
    """Format a dictionary as a human-readable key-value block.

    Args:
        d: Dictionary to format.

    Returns:
        Multi-line string with one ``key: value`` pair per line.
    """
    lines = [f" {k}: {v}" for k, v in d.items()]
    return "\n".join(lines) if lines else " (empty)"


def _select_random(
    pool: list[dict[str, Any]],
    num: int,
    seed: int | None,
) -> list[dict[str, Any]]:
    """Randomly select *num* examples from *pool*.

    Args:
        pool: Candidate examples.
        num: Number of examples to select.
        seed: Random seed for reproducibility.

    Returns:
        Randomly selected examples.
    """
    rng = random.Random(seed)
    return rng.sample(pool, num)


def _select_similar(
    pool: list[dict[str, Any]],
    query: dict[str, Any],
    num: int,
    seed: int | None,
) -> list[dict[str, Any]]:
    """Select examples most similar to *query* by shared category/tags.

    Similarity is computed as the number of matching metadata values
    for the keys ``"category"``, ``"tags"``, ``"language"``, and ``"difficulty"``.
    For the ``"tags"`` key the score is the size of the intersection.

    When there are ties in similarity score, a seeded RNG is used to
    shuffle tied candidates for deterministic tie-breaking.

    Args:
        pool: Candidate examples.
        query: The query dictionary to compare against.
        num: Number of examples to select.
        seed: Random seed for reproducibility in tie-breaking.

    Returns:
        Examples sorted by descending similarity, with ties broken
        deterministically when *seed* is provided.
    """
    rng = random.Random(seed)

    scored: list[tuple[float, int, dict[str, Any]]] = []
    for idx, example in enumerate(pool):
        score = _similarity_score(example, query)
        scored.append((score, idx, example))

    # Group by score and shuffle within each group for deterministic tie-breaking
    # Sort descending by score first
    scored.sort(key=lambda x: x[0], reverse=True)

    # Build groups of tied scores
    groups: list[list[tuple[float, int, dict[str, Any]]]] = []
    current_group: list[tuple[float, int, dict[str, Any]]] = []
    current_score: float | None = None

    for item in scored:
        if current_score is None or item[0] == current_score:
            current_group.append(item)
            current_score = item[0]
        else:
            groups.append(current_group)
            current_group = [item]
            current_score = item[0]
    if current_group:
        groups.append(current_group)

    # Shuffle within each group and flatten
    result: list[dict[str, Any]] = []
    for group in groups:
        rng.shuffle(group)
        for _, _, example in group:
            result.append(example)
            if len(result) == num:
                return result

    return result[:num]


def _similarity_score(example: dict[str, Any], query: dict[str, Any]) -> float:
    """Compute a simple similarity score between an example and a query.

    Checks the following keys:
    - ``"category"``: +1 if equal.
    - ``"language"``: +1 if equal.
    - ``"difficulty"``: +1 if equal.
    - ``"tags"``: +1 for each shared tag (set intersection size).

    Args:
        example: Candidate example dictionary.
        query: Target query dictionary.

    Returns:
        Non-negative similarity score.
    """
    score = 0.0

    for key in ("category", "language", "difficulty"):
        if key in example and key in query and example[key] == query[key]:
            score += 1.0

    example_tags = set(example.get("tags", []))
    query_tags = set(query.get("tags", []))
    score += len(example_tags & query_tags)

    return score


def _select_diverse(
    pool: list[dict[str, Any]],
    num: int,
    seed: int | None,
) -> list[dict[str, Any]]:
    """Select diverse examples covering different categories.

    Groups candidates by their ``"category"`` key (falling back to
    ``"_uncategorized_"``), then round-robins across categories to
    select *num* examples. Within each category, examples are shuffled
    using the provided *seed* for reproducibility.

    Args:
        pool: Candidate examples.
        num: Number of examples to select.
        seed: Random seed for reproducibility.

    Returns:
        Diverse selection of examples from different categories.
    """
    rng = random.Random(seed)

    # Group by category
    categories: dict[str, list[dict[str, Any]]] = {}
    for example in pool:
        cat = example.get("category", "_uncategorized_")
        if cat not in categories:
            categories[cat] = []
        categories[cat].append(example)

    # Shuffle within each category for fairness
    for cat_examples in categories.values():
        rng.shuffle(cat_examples)

    # Sort category keys for deterministic ordering
    sorted_cats = sorted(categories.keys())

    # Track current index within each category's shuffled list
    cat_indices: dict[str, int] = {cat: 0 for cat in sorted_cats}

    result: list[dict[str, Any]] = []
    while len(result) < num:
        added_any = False
        for cat in sorted_cats:
            if len(result) >= num:
                break
            idx = cat_indices[cat]
            if idx < len(categories[cat]):
                result.append(categories[cat][idx])
                cat_indices[cat] = idx + 1
                added_any = True

        # If no category had remaining items, we've exhausted the pool
        if not added_any:
            break

    return result