mcpbr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. mcpbr/benchmarks/__init__.py +12 -0
  2. mcpbr/benchmarks/adversarial.py +341 -0
  3. mcpbr/benchmarks/custom.py +607 -0
  4. mcpbr/benchmarks/longbench.py +623 -0
  5. mcpbr/benchmarks/mmmu.py +353 -0
  6. mcpbr/config.py +4 -0
  7. mcpbr/config_migration.py +470 -0
  8. mcpbr/config_wizard.py +647 -0
  9. mcpbr/custom_metrics.py +405 -0
  10. mcpbr/dashboard.py +619 -0
  11. mcpbr/dataset_streaming.py +491 -0
  12. mcpbr/dataset_versioning.py +222 -0
  13. mcpbr/docker_cache.py +539 -0
  14. mcpbr/docker_prewarm.py +369 -0
  15. mcpbr/dry_run.py +532 -0
  16. mcpbr/failure_analysis.py +558 -0
  17. mcpbr/few_shot.py +367 -0
  18. mcpbr/formatting.py +444 -0
  19. mcpbr/gpu_support.py +157 -0
  20. mcpbr/harness.py +38 -4
  21. mcpbr/latency_metrics.py +317 -0
  22. mcpbr/resource_limits.py +487 -0
  23. mcpbr/result_streaming.py +519 -0
  24. mcpbr/sampling.py +193 -0
  25. mcpbr/task_batching.py +403 -0
  26. mcpbr/task_scheduler.py +468 -0
  27. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/METADATA +10 -6
  28. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/RECORD +38 -15
  29. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  30. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  31. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  32. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  33. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  34. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  35. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  36. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/WHEEL +0 -0
  37. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/entry_points.txt +0 -0
  38. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/few_shot.py ADDED
@@ -0,0 +1,367 @@
1
+ """Few-shot learning support for benchmark evaluations.
2
+
3
+ Provides configurable few-shot example selection with multiple strategies
4
+ (random, similar, diverse), prompt formatting, and learning curve analysis
5
+ to study how performance changes with varying shot counts.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import random
11
+ from dataclasses import dataclass
12
+ from typing import Any
13
+
14
+
15
@dataclass
class FewShotConfig:
    """Configuration for few-shot example selection.

    Attributes:
        num_shots: Number of examples to include (0 for zero-shot).
        selection_strategy: Strategy for selecting examples. One of
            ``"random"``, ``"similar"``, or ``"diverse"``.
        seed: Random seed for reproducibility. ``None`` for non-deterministic.
    """

    num_shots: int = 0
    selection_strategy: str = "random"
    seed: int | None = None

    def __post_init__(self) -> None:
        """Reject invalid shot counts and unknown selection strategies."""
        if self.num_shots < 0:
            raise ValueError(f"num_shots must be non-negative, got {self.num_shots}")

        valid_strategies = {"random", "similar", "diverse"}
        if self.selection_strategy in valid_strategies:
            return
        raise ValueError(
            f"selection_strategy must be one of {valid_strategies}, "
            f"got {self.selection_strategy!r}"
        )
41
+
42
+
43
+ def select_examples(
44
+ pool: list[dict[str, Any]],
45
+ query: dict[str, Any],
46
+ config: FewShotConfig,
47
+ ) -> list[dict[str, Any]]:
48
+ """Select few-shot examples from a pool based on the given configuration.
49
+
50
+ Args:
51
+ pool: List of candidate example dictionaries to select from.
52
+ query: The query/task dictionary that examples are being selected for.
53
+ Used by the ``"similar"`` strategy to find related examples.
54
+ config: Few-shot configuration controlling selection behaviour.
55
+
56
+ Returns:
57
+ List of selected example dictionaries. Length is
58
+ ``min(config.num_shots, len(pool))``.
59
+ """
60
+ if config.num_shots == 0 or not pool:
61
+ return []
62
+
63
+ num = min(config.num_shots, len(pool))
64
+
65
+ if config.selection_strategy == "random":
66
+ return _select_random(pool, num, config.seed)
67
+ elif config.selection_strategy == "similar":
68
+ return _select_similar(pool, query, num, config.seed)
69
+ elif config.selection_strategy == "diverse":
70
+ return _select_diverse(pool, num, config.seed)
71
+ else:
72
+ raise ValueError(f"Unknown selection strategy: {config.selection_strategy!r}")
73
+
74
+
75
def format_few_shot_prompt(
    examples: list[dict[str, Any]],
    query: dict[str, Any],
    template: str,
) -> str:
    """Render *template* with few-shot examples and the target query.

    The *template* string must contain ``{examples}`` and ``{query}``
    placeholders. Each example becomes a numbered ``key: value`` block.

    Args:
        examples: List of few-shot example dictionaries.
        query: The target query dictionary.
        template: A format string containing ``{examples}`` and ``{query}``
            placeholders.

    Returns:
        The fully formatted prompt string.
    """

    def render(d: dict[str, Any]) -> str:
        # Same rendering as the module-level _format_dict helper:
        # one indented "key: value" pair per line.
        rows = [f" {k}: {v}" for k, v in d.items()]
        return "\n".join(rows) if rows else " (empty)"

    if examples:
        blocks = [
            f"Example {i}:\n{render(example)}"
            for i, example in enumerate(examples, 1)
        ]
        examples_text = "\n\n".join(blocks)
    else:
        examples_text = "No examples provided."

    return template.format(examples=examples_text, query=render(query))
106
+
107
+
108
def compute_learning_curve(
    results_by_shots: dict[int, list[dict[str, Any]]],
) -> dict[str, Any]:
    """Analyse how performance changes with the number of few-shot examples.

    For each shot count, computes accuracy (fraction of results whose
    ``"resolved"`` key is truthy), average cost, and average total token
    usage. Shot counts with no results contribute zeros.

    Args:
        results_by_shots: Mapping from shot count (int) to a list of result
            dictionaries. Each result dict may contain keys such as
            ``"resolved"`` (bool), ``"cost"`` (float), and
            ``"tokens"`` (dict with ``"input"`` and ``"output"``).

    Returns:
        Dictionary with keys:

        - ``"shot_counts"``: sorted list of shot counts.
        - ``"accuracy"``: list of accuracy values corresponding to each shot count.
        - ``"avg_cost"``: list of average cost values.
        - ``"avg_tokens"``: list of average total token counts.
        - ``"num_samples"``: list of sample counts for each shot count.
    """

    def summarise(results: list[dict[str, Any]]) -> tuple[float, float, float]:
        # Per shot count: (accuracy, mean cost, mean total tokens).
        n = len(results)
        if n == 0:
            return 0.0, 0.0, 0.0
        hits = sum(1 for r in results if r.get("resolved"))
        cost = sum(r.get("cost", 0.0) for r in results)
        tokens = sum(
            r.get("tokens", {}).get("input", 0) + r.get("tokens", {}).get("output", 0)
            for r in results
        )
        return hits / n, cost / n, tokens / n

    shot_counts = sorted(results_by_shots)
    summaries = [summarise(results_by_shots[s]) for s in shot_counts]

    # An empty mapping naturally yields all-empty lists here.
    return {
        "shot_counts": shot_counts,
        "accuracy": [s[0] for s in summaries],
        "avg_cost": [s[1] for s in summaries],
        "avg_tokens": [s[2] for s in summaries],
        "num_samples": [len(results_by_shots[s]) for s in shot_counts],
    }
179
+
180
+
181
+ # ---------------------------------------------------------------------------
182
+ # Internal helpers
183
+ # ---------------------------------------------------------------------------
184
+
185
+
186
+ def _format_dict(d: dict[str, Any]) -> str:
187
+ """Format a dictionary as a human-readable key-value block.
188
+
189
+ Args:
190
+ d: Dictionary to format.
191
+
192
+ Returns:
193
+ Multi-line string with one ``key: value`` pair per line.
194
+ """
195
+ lines = [f" {k}: {v}" for k, v in d.items()]
196
+ return "\n".join(lines) if lines else " (empty)"
197
+
198
+
199
+ def _select_random(
200
+ pool: list[dict[str, Any]],
201
+ num: int,
202
+ seed: int | None,
203
+ ) -> list[dict[str, Any]]:
204
+ """Randomly select *num* examples from *pool*.
205
+
206
+ Args:
207
+ pool: Candidate examples.
208
+ num: Number of examples to select.
209
+ seed: Random seed for reproducibility.
210
+
211
+ Returns:
212
+ Randomly selected examples.
213
+ """
214
+ rng = random.Random(seed)
215
+ return rng.sample(pool, num)
216
+
217
+
218
+ def _select_similar(
219
+ pool: list[dict[str, Any]],
220
+ query: dict[str, Any],
221
+ num: int,
222
+ seed: int | None,
223
+ ) -> list[dict[str, Any]]:
224
+ """Select examples most similar to *query* by shared category/tags.
225
+
226
+ Similarity is computed as the number of matching metadata values
227
+ for the keys ``"category"``, ``"tags"``, ``"language"``, and ``"difficulty"``.
228
+ For the ``"tags"`` key the score is the size of the intersection.
229
+
230
+ When there are ties in similarity score, a seeded RNG is used to
231
+ shuffle tied candidates for deterministic tie-breaking.
232
+
233
+ Args:
234
+ pool: Candidate examples.
235
+ query: The query dictionary to compare against.
236
+ num: Number of examples to select.
237
+ seed: Random seed for reproducibility in tie-breaking.
238
+
239
+ Returns:
240
+ Examples sorted by descending similarity, with ties broken
241
+ deterministically when *seed* is provided.
242
+ """
243
+ rng = random.Random(seed)
244
+
245
+ scored: list[tuple[float, int, dict[str, Any]]] = []
246
+ for idx, example in enumerate(pool):
247
+ score = _similarity_score(example, query)
248
+ scored.append((score, idx, example))
249
+
250
+ # Group by score and shuffle within each group for deterministic tie-breaking
251
+ # Sort descending by score first
252
+ scored.sort(key=lambda x: x[0], reverse=True)
253
+
254
+ # Build groups of tied scores
255
+ groups: list[list[tuple[float, int, dict[str, Any]]]] = []
256
+ current_group: list[tuple[float, int, dict[str, Any]]] = []
257
+ current_score: float | None = None
258
+
259
+ for item in scored:
260
+ if current_score is None or item[0] == current_score:
261
+ current_group.append(item)
262
+ current_score = item[0]
263
+ else:
264
+ groups.append(current_group)
265
+ current_group = [item]
266
+ current_score = item[0]
267
+ if current_group:
268
+ groups.append(current_group)
269
+
270
+ # Shuffle within each group and flatten
271
+ result: list[dict[str, Any]] = []
272
+ for group in groups:
273
+ rng.shuffle(group)
274
+ for _, _, example in group:
275
+ result.append(example)
276
+ if len(result) == num:
277
+ return result
278
+
279
+ return result[:num]
280
+
281
+
282
+ def _similarity_score(example: dict[str, Any], query: dict[str, Any]) -> float:
283
+ """Compute a simple similarity score between an example and a query.
284
+
285
+ Checks the following keys:
286
+ - ``"category"``: +1 if equal.
287
+ - ``"language"``: +1 if equal.
288
+ - ``"difficulty"``: +1 if equal.
289
+ - ``"tags"``: +1 for each shared tag (set intersection size).
290
+
291
+ Args:
292
+ example: Candidate example dictionary.
293
+ query: Target query dictionary.
294
+
295
+ Returns:
296
+ Non-negative similarity score.
297
+ """
298
+ score = 0.0
299
+
300
+ for key in ("category", "language", "difficulty"):
301
+ if key in example and key in query and example[key] == query[key]:
302
+ score += 1.0
303
+
304
+ example_tags = set(example.get("tags", []))
305
+ query_tags = set(query.get("tags", []))
306
+ score += len(example_tags & query_tags)
307
+
308
+ return score
309
+
310
+
311
+ def _select_diverse(
312
+ pool: list[dict[str, Any]],
313
+ num: int,
314
+ seed: int | None,
315
+ ) -> list[dict[str, Any]]:
316
+ """Select diverse examples covering different categories.
317
+
318
+ Groups candidates by their ``"category"`` key (falling back to
319
+ ``"_uncategorized_"``), then round-robins across categories to
320
+ select *num* examples. Within each category, examples are shuffled
321
+ using the provided *seed* for reproducibility.
322
+
323
+ Args:
324
+ pool: Candidate examples.
325
+ num: Number of examples to select.
326
+ seed: Random seed for reproducibility.
327
+
328
+ Returns:
329
+ Diverse selection of examples from different categories.
330
+ """
331
+ rng = random.Random(seed)
332
+
333
+ # Group by category
334
+ categories: dict[str, list[dict[str, Any]]] = {}
335
+ for example in pool:
336
+ cat = example.get("category", "_uncategorized_")
337
+ if cat not in categories:
338
+ categories[cat] = []
339
+ categories[cat].append(example)
340
+
341
+ # Shuffle within each category for fairness
342
+ for cat_examples in categories.values():
343
+ rng.shuffle(cat_examples)
344
+
345
+ # Sort category keys for deterministic ordering
346
+ sorted_cats = sorted(categories.keys())
347
+
348
+ # Track current index within each category's shuffled list
349
+ cat_indices: dict[str, int] = {cat: 0 for cat in sorted_cats}
350
+
351
+ result: list[dict[str, Any]] = []
352
+ while len(result) < num:
353
+ added_any = False
354
+ for cat in sorted_cats:
355
+ if len(result) >= num:
356
+ break
357
+ idx = cat_indices[cat]
358
+ if idx < len(categories[cat]):
359
+ result.append(categories[cat][idx])
360
+ cat_indices[cat] = idx + 1
361
+ added_any = True
362
+
363
+ # If no category had remaining items, we've exhausted the pool
364
+ if not added_any:
365
+ break
366
+
367
+ return result