mcpbr 0.4.14__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcpbr/few_shot.py ADDED
@@ -0,0 +1,367 @@
1
+ """Few-shot learning support for benchmark evaluations.
2
+
3
+ Provides configurable few-shot example selection with multiple strategies
4
+ (random, similar, diverse), prompt formatting, and learning curve analysis
5
+ to study how performance changes with varying shot counts.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import random
11
+ from dataclasses import dataclass
12
+ from typing import Any
13
+
14
+
15
@dataclass
class FewShotConfig:
    """Settings that control few-shot example selection.

    Attributes:
        num_shots: How many examples to include (0 means zero-shot).
        selection_strategy: Strategy used to pick examples; one of
            ``"random"``, ``"similar"``, or ``"diverse"``.
        seed: Seed for reproducible selection; ``None`` leaves the RNG
            non-deterministic.
    """

    num_shots: int = 0
    selection_strategy: str = "random"
    seed: int | None = None

    def __post_init__(self) -> None:
        """Reject invalid field values immediately after construction."""
        if self.num_shots < 0:
            raise ValueError(f"num_shots must be non-negative, got {self.num_shots}")

        valid_strategies = {"random", "similar", "diverse"}
        if self.selection_strategy in valid_strategies:
            return
        raise ValueError(
            f"selection_strategy must be one of {valid_strategies}, "
            f"got {self.selection_strategy!r}"
        )
41
+
42
+
43
def select_examples(
    pool: list[dict[str, Any]],
    query: dict[str, Any],
    config: FewShotConfig,
) -> list[dict[str, Any]]:
    """Pick few-shot examples out of *pool* according to *config*.

    Args:
        pool: Candidate example dictionaries.
        query: The task the examples are chosen for; only consulted by the
            ``"similar"`` strategy.
        config: Controls shot count, strategy, and seeding.

    Returns:
        The chosen examples; length is ``min(config.num_shots, len(pool))``.
    """
    # Zero-shot or nothing to choose from: nothing to do.
    if config.num_shots == 0 or not pool:
        return []

    wanted = min(config.num_shots, len(pool))
    strategy = config.selection_strategy

    if strategy == "random":
        return _select_random(pool, wanted, config.seed)
    if strategy == "similar":
        return _select_similar(pool, query, wanted, config.seed)
    if strategy == "diverse":
        return _select_diverse(pool, wanted, config.seed)

    # Defensive: FewShotConfig normally validates the strategy already.
    raise ValueError(f"Unknown selection strategy: {config.selection_strategy!r}")
73
+
74
+
75
def format_few_shot_prompt(
    examples: list[dict[str, Any]],
    query: dict[str, Any],
    template: str,
) -> str:
    """Render a prompt from a template, few-shot examples, and a query.

    The *template* must contain ``{examples}`` and ``{query}`` placeholders.
    Each example becomes a numbered "Example N:" block built from the
    example's key/value pairs.

    Args:
        examples: Few-shot example dictionaries.
        query: The target query dictionary.
        template: Format string with ``{examples}`` and ``{query}`` slots.

    Returns:
        The fully rendered prompt.
    """
    if examples:
        blocks = [
            f"Example {i}:\n{_format_dict(example)}"
            for i, example in enumerate(examples, 1)
        ]
        examples_text = "\n\n".join(blocks)
    else:
        examples_text = "No examples provided."

    return template.format(examples=examples_text, query=_format_dict(query))
106
+
107
+
108
def compute_learning_curve(
    results_by_shots: dict[int, list[dict[str, Any]]],
) -> dict[str, Any]:
    """Summarise benchmark performance as a function of shot count.

    For every shot count, computes accuracy (fraction of results whose
    ``"resolved"`` entry is truthy), mean cost, and mean total token usage.

    Args:
        results_by_shots: Maps a shot count to the list of result dicts
            gathered at that count. Result dicts may carry ``"resolved"``
            (bool), ``"cost"`` (float), and ``"tokens"`` (dict with
            ``"input"`` and ``"output"``).

    Returns:
        Dict of parallel lists keyed by ``"shot_counts"``, ``"accuracy"``,
        ``"avg_cost"``, ``"avg_tokens"``, and ``"num_samples"``; all lists
        are empty when the input mapping is empty.
    """
    shot_counts = sorted(results_by_shots)
    accuracy: list[float] = []
    avg_cost: list[float] = []
    avg_tokens: list[float] = []
    num_samples: list[int] = []

    for shots in shot_counts:
        rows = results_by_shots[shots]
        count = len(rows)
        num_samples.append(count)

        if count == 0:
            # No data at this shot count: report zeros rather than divide.
            accuracy.append(0.0)
            avg_cost.append(0.0)
            avg_tokens.append(0.0)
            continue

        accuracy.append(sum(1 for row in rows if row.get("resolved")) / count)
        avg_cost.append(sum(row.get("cost", 0.0) for row in rows) / count)

        usage = (row.get("tokens", {}) for row in rows)
        avg_tokens.append(
            sum(t.get("input", 0) + t.get("output", 0) for t in usage) / count
        )

    return {
        "shot_counts": shot_counts,
        "accuracy": accuracy,
        "avg_cost": avg_cost,
        "avg_tokens": avg_tokens,
        "num_samples": num_samples,
    }
179
+
180
+
181
+ # ---------------------------------------------------------------------------
182
+ # Internal helpers
183
+ # ---------------------------------------------------------------------------
184
+
185
+
186
+ def _format_dict(d: dict[str, Any]) -> str:
187
+ """Format a dictionary as a human-readable key-value block.
188
+
189
+ Args:
190
+ d: Dictionary to format.
191
+
192
+ Returns:
193
+ Multi-line string with one ``key: value`` pair per line.
194
+ """
195
+ lines = [f" {k}: {v}" for k, v in d.items()]
196
+ return "\n".join(lines) if lines else " (empty)"
197
+
198
+
199
+ def _select_random(
200
+ pool: list[dict[str, Any]],
201
+ num: int,
202
+ seed: int | None,
203
+ ) -> list[dict[str, Any]]:
204
+ """Randomly select *num* examples from *pool*.
205
+
206
+ Args:
207
+ pool: Candidate examples.
208
+ num: Number of examples to select.
209
+ seed: Random seed for reproducibility.
210
+
211
+ Returns:
212
+ Randomly selected examples.
213
+ """
214
+ rng = random.Random(seed)
215
+ return rng.sample(pool, num)
216
+
217
+
218
def _select_similar(
    pool: list[dict[str, Any]],
    query: dict[str, Any],
    num: int,
    seed: int | None,
) -> list[dict[str, Any]]:
    """Pick the *num* examples most similar to *query*.

    Similarity counts matching metadata for ``"category"``, ``"language"``,
    and ``"difficulty"`` plus the size of the ``"tags"`` intersection (see
    ``_similarity_score``). Candidates with equal scores are shuffled with
    a seeded RNG so ties break deterministically for a given seed.

    Args:
        pool: Candidate examples.
        query: Dictionary the candidates are compared against.
        num: How many examples to return.
        seed: Seed for reproducible tie-breaking.

    Returns:
        Examples in descending similarity order, ties broken
        deterministically when *seed* is provided.
    """
    rng = random.Random(seed)

    # Score every candidate; carrying the pool index means sorting never
    # has to compare the dicts themselves, and the sort stays stable.
    ranked = sorted(
        (
            (_similarity_score(example, query), position, example)
            for position, example in enumerate(pool)
        ),
        key=lambda entry: entry[0],
        reverse=True,
    )

    # Bucket candidates by score. Because `ranked` is already in
    # descending-score order, dict insertion order keeps the buckets
    # descending as well.
    buckets: dict[float, list[tuple[float, int, dict[str, Any]]]] = {}
    for entry in ranked:
        buckets.setdefault(entry[0], []).append(entry)

    # Shuffle each tied bucket, then take candidates until `num` is reached.
    selected: list[dict[str, Any]] = []
    for tied in buckets.values():
        rng.shuffle(tied)
        selected.extend(example for _, _, example in tied)
        if len(selected) >= num:
            break

    return selected[:num]
280
+
281
+
282
+ def _similarity_score(example: dict[str, Any], query: dict[str, Any]) -> float:
283
+ """Compute a simple similarity score between an example and a query.
284
+
285
+ Checks the following keys:
286
+ - ``"category"``: +1 if equal.
287
+ - ``"language"``: +1 if equal.
288
+ - ``"difficulty"``: +1 if equal.
289
+ - ``"tags"``: +1 for each shared tag (set intersection size).
290
+
291
+ Args:
292
+ example: Candidate example dictionary.
293
+ query: Target query dictionary.
294
+
295
+ Returns:
296
+ Non-negative similarity score.
297
+ """
298
+ score = 0.0
299
+
300
+ for key in ("category", "language", "difficulty"):
301
+ if key in example and key in query and example[key] == query[key]:
302
+ score += 1.0
303
+
304
+ example_tags = set(example.get("tags", []))
305
+ query_tags = set(query.get("tags", []))
306
+ score += len(example_tags & query_tags)
307
+
308
+ return score
309
+
310
+
311
+ def _select_diverse(
312
+ pool: list[dict[str, Any]],
313
+ num: int,
314
+ seed: int | None,
315
+ ) -> list[dict[str, Any]]:
316
+ """Select diverse examples covering different categories.
317
+
318
+ Groups candidates by their ``"category"`` key (falling back to
319
+ ``"_uncategorized_"``), then round-robins across categories to
320
+ select *num* examples. Within each category, examples are shuffled
321
+ using the provided *seed* for reproducibility.
322
+
323
+ Args:
324
+ pool: Candidate examples.
325
+ num: Number of examples to select.
326
+ seed: Random seed for reproducibility.
327
+
328
+ Returns:
329
+ Diverse selection of examples from different categories.
330
+ """
331
+ rng = random.Random(seed)
332
+
333
+ # Group by category
334
+ categories: dict[str, list[dict[str, Any]]] = {}
335
+ for example in pool:
336
+ cat = example.get("category", "_uncategorized_")
337
+ if cat not in categories:
338
+ categories[cat] = []
339
+ categories[cat].append(example)
340
+
341
+ # Shuffle within each category for fairness
342
+ for cat_examples in categories.values():
343
+ rng.shuffle(cat_examples)
344
+
345
+ # Sort category keys for deterministic ordering
346
+ sorted_cats = sorted(categories.keys())
347
+
348
+ # Track current index within each category's shuffled list
349
+ cat_indices: dict[str, int] = {cat: 0 for cat in sorted_cats}
350
+
351
+ result: list[dict[str, Any]] = []
352
+ while len(result) < num:
353
+ added_any = False
354
+ for cat in sorted_cats:
355
+ if len(result) >= num:
356
+ break
357
+ idx = cat_indices[cat]
358
+ if idx < len(categories[cat]):
359
+ result.append(categories[cat][idx])
360
+ cat_indices[cat] = idx + 1
361
+ added_any = True
362
+
363
+ # If no category had remaining items, we've exhausted the pool
364
+ if not added_any:
365
+ break
366
+
367
+ return result
mcpbr/gpu_support.py ADDED
@@ -0,0 +1,157 @@
1
+ """GPU support for Docker containers used in ML benchmark evaluations.
2
+
3
+ Provides detection of available GPUs (NVIDIA), Docker GPU runtime checks,
4
+ and Docker container configuration for GPU access.
5
+ """
6
+
7
+ import logging
8
+ import subprocess
9
+
10
+ import docker
11
+ import docker.types
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def detect_gpus() -> dict:
    """Detect available GPUs on the host system.

    Checks for NVIDIA GPUs via nvidia-smi and verifies the Docker GPU runtime
    is available.

    Returns:
        Dictionary with GPU detection results:
        - nvidia_available (bool): Whether NVIDIA GPUs were detected.
        - gpu_count (int): Number of GPUs found.
        - gpu_names (list[str]): Names of detected GPUs.
        - driver_version (str): NVIDIA driver version, or empty string.
        - docker_runtime_available (bool): Whether Docker NVIDIA runtime is available.
    """
    info: dict = {
        "nvidia_available": False,
        "gpu_count": 0,
        "gpu_names": [],
        "driver_version": "",
        "docker_runtime_available": False,
    }

    # Detect NVIDIA GPUs via nvidia-smi; a missing binary simply means
    # there is nothing to detect, so that case is only logged at DEBUG.
    try:
        result = subprocess.run(
            [
                "nvidia-smi",
                "--query-gpu=name,driver_version",
                "--format=csv,noheader,nounits",
            ],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if result.returncode == 0 and result.stdout.strip():
            lines = result.stdout.strip().splitlines()
            gpu_names = []
            driver_version = ""
            for line in lines:
                # Each CSV line is "name, driver_version"; tolerate a
                # name-only line if the driver column is absent.
                parts = [p.strip() for p in line.split(",")]
                if len(parts) >= 2:
                    gpu_names.append(parts[0])
                    driver_version = parts[1]
                elif len(parts) == 1:
                    gpu_names.append(parts[0])

            info["nvidia_available"] = True
            info["gpu_count"] = len(gpu_names)
            info["gpu_names"] = gpu_names
            info["driver_version"] = driver_version
    except FileNotFoundError:
        logger.debug("nvidia-smi not found; no NVIDIA GPUs detected.")
    except subprocess.TimeoutExpired:
        logger.warning("nvidia-smi timed out while detecting GPUs.")
    except Exception as e:
        # Lazy %-style args: the message is only formatted when DEBUG
        # logging is actually enabled (was an eagerly-formatted f-string).
        logger.debug("GPU detection failed: %s", e)

    # Check Docker GPU runtime availability
    info["docker_runtime_available"] = check_gpu_runtime()

    return info
77
+
78
+
79
def get_docker_gpu_config(gpu_enabled: bool) -> dict:
    """Build Docker container creation kwargs for GPU access.

    When *gpu_enabled* is true, the returned mapping carries a
    DeviceRequest exposing every NVIDIA GPU; merge it into the kwargs for
    ``docker.containers.run()`` or ``docker.containers.create()``.

    Args:
        gpu_enabled: Whether the container should see the host GPUs.

    Returns:
        Empty dict when *gpu_enabled* is false; otherwise a dict with a
        ``device_requests`` entry requesting all GPUs.
    """
    if not gpu_enabled:
        return {}

    # count=-1 means "all available GPUs" to the NVIDIA container runtime.
    all_gpus = docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])
    return {"device_requests": [all_gpus]}
105
+
106
+
107
def check_gpu_runtime() -> bool:
    """Report whether Docker exposes the NVIDIA runtime.

    Asks the Docker daemon for its registered runtimes and looks for an
    entry named ``nvidia``.

    Returns:
        True when the NVIDIA Docker runtime is registered, False otherwise
        (including when the daemon cannot be reached).
    """
    try:
        registered = docker.from_env().info().get("Runtimes", {})
    except docker.errors.DockerException as e:
        logger.debug(f"Could not query Docker for GPU runtime: {e}")
        return False
    except Exception as e:
        logger.debug(f"Unexpected error checking Docker GPU runtime: {e}")
        return False
    return "nvidia" in registered
127
+
128
+
129
def format_gpu_info(info: dict) -> str:
    """Render GPU detection results as a human-readable report.

    Args:
        info: Dictionary produced by ``detect_gpus()``.

    Returns:
        Multi-line string describing detected GPUs, driver version, and
        Docker runtime availability.
    """
    report: list[str] = []

    if info.get("nvidia_available"):
        report.append(f"NVIDIA GPUs detected: {info.get('gpu_count', 0)}")
        report.extend(
            f" GPU {i}: {name}" for i, name in enumerate(info.get("gpu_names", []))
        )
    else:
        report.append("No NVIDIA GPUs detected.")

    driver = info.get("driver_version", "")
    if driver:
        report.append(f"Driver version: {driver}")

    runtime_state = "available" if info.get("docker_runtime_available", False) else "not available"
    report.append(f"Docker NVIDIA runtime: {runtime_state}")

    return "\n".join(report)
mcpbr/harness.py CHANGED
@@ -1125,6 +1125,10 @@ async def run_evaluation(
1125
1125
  # Wait for cancellation to complete
1126
1126
  await asyncio.gather(*async_tasks, return_exceptions=True)
1127
1127
  break
1128
+
1129
+ # Explicitly stop before exiting context to avoid
1130
+ # deadlock between Rich's rendering thread and asyncio
1131
+ progress.stop()
1128
1132
  else:
1129
1133
  # In non-verbose mode, show overall progress bar + per-task spinners
1130
1134
  with Progress(
@@ -1172,6 +1176,10 @@ async def run_evaluation(
1172
1176
  # Wait for cancellation to complete
1173
1177
  await asyncio.gather(*async_tasks, return_exceptions=True)
1174
1178
  break
1179
+
1180
+ # Explicitly stop before exiting context to avoid
1181
+ # deadlock between Rich's rendering thread and asyncio
1182
+ progress.stop()
1175
1183
  finally:
1176
1184
  await docker_manager.cleanup_all()
1177
1185