mcpbr 0.4.16__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (38)
  1. mcpbr/__init__.py +20 -1
  2. mcpbr/config.py +37 -1
  3. mcpbr/config_migration.py +470 -0
  4. mcpbr/config_wizard.py +647 -0
  5. mcpbr/dashboard.py +619 -0
  6. mcpbr/dataset_streaming.py +491 -0
  7. mcpbr/docker_cache.py +539 -0
  8. mcpbr/docker_env.py +2 -1
  9. mcpbr/docker_prewarm.py +370 -0
  10. mcpbr/dry_run.py +533 -0
  11. mcpbr/formatting.py +444 -0
  12. mcpbr/gpu_support.py +2 -1
  13. mcpbr/graceful_degradation.py +277 -0
  14. mcpbr/harness.py +38 -4
  15. mcpbr/languages.py +228 -0
  16. mcpbr/logging_config.py +207 -0
  17. mcpbr/models.py +66 -0
  18. mcpbr/preflight.py +2 -1
  19. mcpbr/pricing.py +72 -0
  20. mcpbr/providers.py +316 -3
  21. mcpbr/resource_limits.py +487 -0
  22. mcpbr/result_streaming.py +519 -0
  23. mcpbr/sdk.py +264 -0
  24. mcpbr/smoke_test.py +2 -1
  25. mcpbr/task_batching.py +403 -0
  26. mcpbr/task_scheduler.py +468 -0
  27. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/METADATA +8 -1
  28. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/RECORD +38 -22
  29. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  30. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  31. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  32. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  33. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  34. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  35. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  36. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/WHEEL +0 -0
  37. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/entry_points.txt +0 -0
  38. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/result_streaming.py (new file)
@@ -0,0 +1,519 @@
+"""Result streaming to external storage backends.
+
+Streams evaluation results to external storage as each task completes,
+rather than waiting for the full evaluation to finish. Supports multiple
+backends (local file, S3-compatible, webhook/HTTP POST) with buffering
+and retry logic. Failures in streaming never block the evaluation.
+"""
+
+import asyncio
+import json
+import logging
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Protocol, runtime_checkable
+
+logger = logging.getLogger(__name__)
+
+
+@runtime_checkable
+class StreamBackend(Protocol):
+    """Protocol for result streaming backends.
+
+    All backends must implement async send, flush, and close methods.
+    Implementations should be fault-tolerant and never raise exceptions
+    that would block the evaluation pipeline.
+    """
+
+    async def send(self, result: dict) -> bool:
+        """Send a single result to the backend.
+
+        Args:
+            result: Evaluation result dictionary to stream.
+
+        Returns:
+            True if the result was successfully sent, False otherwise.
+        """
+        ...
+
+    async def flush(self) -> None:
+        """Flush any buffered results to the backend."""
+        ...
+
+    async def close(self) -> None:
+        """Close the backend and release any resources."""
+        ...
+
+
+class LocalFileStream:
+    """Streams results to a local JSONL file.
+
+    Appends each result as a JSON line to the specified file path.
+    Automatically creates parent directories if they do not exist.
+    """
+
+    def __init__(self, path: str | Path) -> None:
+        """Initialize local file stream backend.
+
+        Args:
+            path: File path for JSONL output. Parent directories are
+                created automatically.
+        """
+        self._path = Path(path)
+        self._path.parent.mkdir(parents=True, exist_ok=True)
+
+    async def send(self, result: dict) -> bool:
+        """Append a result as a JSON line to the file.
+
+        Args:
+            result: Evaluation result dictionary.
+
+        Returns:
+            True if the write succeeded, False otherwise.
+        """
+        try:
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(None, self._write_line, result)
+            return True
+        except Exception:
+            logger.exception("Failed to write result to %s", self._path)
+            return False
+
+    def _write_line(self, result: dict) -> None:
+        """Write a single JSON line to the file (sync, for executor).
+
+        Args:
+            result: Evaluation result dictionary.
+        """
+        with open(self._path, "a") as f:
+            f.write(json.dumps(result, default=str) + "\n")
+            f.flush()
+
+    async def flush(self) -> None:
+        """Flush is a no-op for local file; each write already flushes."""
+
+    async def close(self) -> None:
+        """Close is a no-op for local file stream."""
+
+
+class S3Stream:
+    """Streams results to an S3-compatible object store.
+
+    Each result is uploaded as an individual JSON object at
+    ``s3://<bucket>/<prefix>/<task_id>.json``. Requires the ``boto3``
+    package; gracefully degrades if it is not installed.
+    """
+
+    def __init__(
+        self,
+        bucket: str,
+        prefix: str = "",
+        region_name: str | None = None,
+        endpoint_url: str | None = None,
+    ) -> None:
+        """Initialize S3 stream backend.
+
+        Args:
+            bucket: S3 bucket name.
+            prefix: Key prefix for uploaded objects (e.g. ``"results/run-1"``).
+            region_name: AWS region name (optional).
+            endpoint_url: Custom endpoint URL for S3-compatible services
+                (e.g. MinIO). Optional.
+        """
+        self._bucket = bucket
+        self._prefix = prefix.strip("/")
+        self._client: Any = None
+        self._region_name = region_name
+        self._endpoint_url = endpoint_url
+        self._available = False
+        self._init_client()
+
+    def _init_client(self) -> None:
+        """Initialize the boto3 S3 client if boto3 is available."""
+        try:
+            import boto3
+
+            kwargs: dict[str, Any] = {}
+            if self._region_name:
+                kwargs["region_name"] = self._region_name
+            if self._endpoint_url:
+                kwargs["endpoint_url"] = self._endpoint_url
+            self._client = boto3.client("s3", **kwargs)
+            self._available = True
+        except ImportError:
+            logger.warning("boto3 is not installed; S3 streaming backend is disabled")
+            self._available = False
+        except Exception:
+            logger.exception("Failed to initialize S3 client")
+            self._available = False
+
+    async def send(self, result: dict) -> bool:
+        """Upload a result as a JSON object to S3.
+
+        The object key is derived from the ``instance_id`` field in the
+        result dict, falling back to a timestamp-based key.
+
+        Args:
+            result: Evaluation result dictionary.
+
+        Returns:
+            True if the upload succeeded, False otherwise.
+        """
+        if not self._available or self._client is None:
+            return False
+
+        try:
+            task_id = result.get("instance_id", f"result-{time.time()}")
+            key = f"{self._prefix}/{task_id}.json" if self._prefix else f"{task_id}.json"
+            body = json.dumps(result, default=str)
+
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(
+                None,
+                lambda: self._client.put_object(
+                    Bucket=self._bucket,
+                    Key=key,
+                    Body=body.encode("utf-8"),
+                    ContentType="application/json",
+                ),
+            )
+            return True
+        except Exception:
+            logger.exception("Failed to upload result to S3 bucket %s", self._bucket)
+            return False
+
+    async def flush(self) -> None:
+        """Flush is a no-op for S3; each send is an individual upload."""
+
+    async def close(self) -> None:
+        """Close the S3 client (release resources)."""
+        self._client = None
+        self._available = False
+
+
+class WebhookStream:
+    """Streams results via HTTP POST to a webhook URL.
+
+    Sends each result as a JSON payload. Supports configurable headers
+    and timeout.
+    """
+
+    def __init__(
+        self,
+        url: str,
+        headers: dict[str, str] | None = None,
+        timeout: float = 30.0,
+    ) -> None:
+        """Initialize webhook stream backend.
+
+        Args:
+            url: Webhook URL to POST results to.
+            headers: Optional HTTP headers to include in requests.
+            timeout: Request timeout in seconds.
+        """
+        self._url = url
+        self._headers = headers or {}
+        self._timeout = timeout
+        self._session: Any = None
+
+    def _get_session(self) -> Any:
+        """Get or create a requests Session (lazy init).
+
+        Returns:
+            A ``requests.Session`` instance.
+        """
+        if self._session is None:
+            import requests
+
+            self._session = requests.Session()
+            self._session.headers.update({"Content-Type": "application/json"})
+            self._session.headers.update(self._headers)
+        return self._session
+
+    async def send(self, result: dict) -> bool:
+        """POST a result as JSON to the webhook URL.
+
+        Args:
+            result: Evaluation result dictionary.
+
+        Returns:
+            True if the request returned a 2xx status, False otherwise.
+        """
+        try:
+            loop = asyncio.get_running_loop()
+            response = await loop.run_in_executor(None, self._post, result)
+            success = 200 <= response.status_code < 300
+            if not success:
+                logger.warning(
+                    "Webhook returned status %d for %s",
+                    response.status_code,
+                    self._url,
+                )
+            return success
+        except Exception:
+            logger.exception("Failed to POST result to webhook %s", self._url)
+            return False
+
+    def _post(self, result: dict) -> Any:
+        """Perform the synchronous HTTP POST (for executor).
+
+        Args:
+            result: Evaluation result dictionary.
+
+        Returns:
+            The ``requests.Response`` object.
+        """
+        session = self._get_session()
+        return session.post(
+            self._url,
+            data=json.dumps(result, default=str),
+            timeout=self._timeout,
+        )
+
+    async def flush(self) -> None:
+        """Flush is a no-op for webhook; each send is an individual POST."""
+
+    async def close(self) -> None:
+        """Close the HTTP session and release resources."""
+        if self._session is not None:
+            self._session.close()
+            self._session = None
+
+
+class ResultStreamer:
+    """Orchestrates streaming of results to multiple backends.
+
+    Sends each result to all configured backends with optional buffering
+    and retry logic. Streaming failures are logged but never propagated
+    to the caller, ensuring the evaluation pipeline is not blocked.
+
+    Args:
+        backends: List of stream backends to send results to.
+        buffer_size: Number of results to buffer before flushing.
+            A value of 1 means results are sent immediately.
+        max_retries: Maximum number of retry attempts per backend
+            on failure.
+        retry_delay: Base delay in seconds between retries
+            (doubles on each retry).
+    """
+
+    def __init__(
+        self,
+        backends: list[StreamBackend],
+        buffer_size: int = 1,
+        max_retries: int = 3,
+        retry_delay: float = 1.0,
+    ) -> None:
+        self._backends = list(backends)
+        self._buffer_size = max(1, buffer_size)
+        self._max_retries = max_retries
+        self._retry_delay = retry_delay
+        self._buffer: list[dict] = []
+        self._sent_count = 0
+        self._failed_count = 0
+
+    @property
+    def sent_count(self) -> int:
+        """Number of results successfully sent to at least one backend."""
+        return self._sent_count
+
+    @property
+    def failed_count(self) -> int:
+        """Number of individual backend send failures (after retries)."""
+        return self._failed_count
+
+    async def send(self, result: dict) -> None:
+        """Buffer a result and flush if the buffer is full.
+
+        Args:
+            result: Evaluation result dictionary to stream.
+        """
+        self._buffer.append(result)
+        if len(self._buffer) >= self._buffer_size:
+            await self.flush()
+
+    async def flush(self) -> None:
+        """Flush all buffered results to every backend.
+
+        Each result is sent to each backend with retry logic. Failures
+        are logged but do not raise exceptions.
+        """
+        if not self._buffer:
+            return
+
+        results_to_send = list(self._buffer)
+        self._buffer.clear()
+
+        for result in results_to_send:
+            any_success = False
+            for backend in self._backends:
+                success = await self._send_with_retry(backend, result)
+                if success:
+                    any_success = True
+            if any_success:
+                self._sent_count += 1
+
+        # Flush all backends
+        for backend in self._backends:
+            try:
+                await backend.flush()
+            except Exception:
+                logger.exception("Failed to flush backend %s", type(backend).__name__)
+
+    async def _send_with_retry(self, backend: StreamBackend, result: dict) -> bool:
+        """Send a result to a single backend with retry logic.
+
+        Uses exponential backoff between retries.
+
+        Args:
+            backend: The stream backend to send to.
+            result: Evaluation result dictionary.
+
+        Returns:
+            True if the send eventually succeeded, False after all
+            retries are exhausted.
+        """
+        delay = self._retry_delay
+        for attempt in range(self._max_retries):
+            try:
+                success = await backend.send(result)
+                if success:
+                    return True
+            except Exception:
+                logger.exception(
+                    "Exception sending to %s (attempt %d/%d)",
+                    type(backend).__name__,
+                    attempt + 1,
+                    self._max_retries,
+                )
+            if attempt < self._max_retries - 1:
+                await asyncio.sleep(delay)
+                delay *= 2
+
+        self._failed_count += 1
+        logger.warning(
+            "All %d retries exhausted for %s",
+            self._max_retries,
+            type(backend).__name__,
+        )
+        return False
+
+    async def close(self) -> None:
+        """Flush remaining buffer and close all backends."""
+        await self.flush()
+        for backend in self._backends:
+            try:
+                await backend.close()
+            except Exception:
+                logger.exception("Failed to close backend %s", type(backend).__name__)
+
+
+@dataclass
+class StreamConfig:
+    """Configuration for a single stream backend.
+
+    Attributes:
+        backend_type: Type of backend (``"local"``, ``"s3"``, ``"webhook"``).
+        path: File path for local backend.
+        url: URL for webhook backend.
+        bucket: S3 bucket name.
+        prefix: S3 key prefix.
+        headers: HTTP headers for webhook backend.
+        region_name: AWS region for S3 backend.
+        endpoint_url: Custom endpoint URL for S3-compatible services.
+        timeout: Request timeout for webhook backend (seconds).
+    """
+
+    backend_type: str
+    path: str | None = None
+    url: str | None = None
+    bucket: str | None = None
+    prefix: str | None = None
+    headers: dict[str, str] = field(default_factory=dict)
+    region_name: str | None = None
+    endpoint_url: str | None = None
+    timeout: float = 30.0
+
+
+def create_backend(config: StreamConfig) -> StreamBackend | None:
+    """Create a single stream backend from configuration.
+
+    Args:
+        config: Stream backend configuration.
+
+    Returns:
+        A stream backend instance, or None if the configuration is
+        invalid or the backend cannot be initialized.
+    """
+    backend_type = config.backend_type.lower()
+
+    if backend_type == "local":
+        if not config.path:
+            logger.error("Local stream backend requires a 'path'")
+            return None
+        return LocalFileStream(path=config.path)
+
+    elif backend_type == "s3":
+        if not config.bucket:
+            logger.error("S3 stream backend requires a 'bucket'")
+            return None
+        return S3Stream(
+            bucket=config.bucket,
+            prefix=config.prefix or "",
+            region_name=config.region_name,
+            endpoint_url=config.endpoint_url,
+        )
+
+    elif backend_type == "webhook":
+        if not config.url:
+            logger.error("Webhook stream backend requires a 'url'")
+            return None
+        return WebhookStream(
+            url=config.url,
+            headers=config.headers,
+            timeout=config.timeout,
+        )
+
+    else:
+        logger.error("Unknown stream backend type: %s", config.backend_type)
+        return None
+
+
+def create_streamer(
+    configs: list[StreamConfig],
+    buffer_size: int = 1,
+    max_retries: int = 3,
+    retry_delay: float = 1.0,
+) -> ResultStreamer:
+    """Factory function to create a ResultStreamer from configuration.
+
+    Creates backends for each valid configuration entry and returns
+    a ``ResultStreamer`` that dispatches to all of them.
+
+    Args:
+        configs: List of stream backend configurations.
+        buffer_size: Number of results to buffer before flushing.
+        max_retries: Maximum retry attempts per backend on failure.
+        retry_delay: Base delay in seconds between retries.
+
+    Returns:
+        Configured ResultStreamer instance. If no valid backends could
+        be created, the streamer will have an empty backend list and
+        all send operations will be no-ops.
+    """
+    backends: list[StreamBackend] = []
+    for config in configs:
+        backend = create_backend(config)
+        if backend is not None:
+            backends.append(backend)

+    if not backends:
+        logger.warning("No valid stream backends configured; streaming will be a no-op")
+
+    return ResultStreamer(
+        backends=backends,
+        buffer_size=buffer_size,
+        max_retries=max_retries,
+        retry_delay=retry_delay,
+    )
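
For orientation, here is a minimal sketch of how the new module might be driven, based only on the API visible in the diff above; the file path, webhook URL, and result dictionary are illustrative placeholders, not values shipped with the package.

import asyncio

from mcpbr.result_streaming import StreamConfig, create_streamer


async def main() -> None:
    # Fan out to a local JSONL file and a webhook. create_backend() skips
    # invalid entries, so a bad config means fewer backends, not a crash.
    streamer = create_streamer(
        configs=[
            StreamConfig(backend_type="local", path="results/run-1.jsonl"),
            StreamConfig(backend_type="webhook", url="https://example.com/hook"),
        ],
        buffer_size=1,    # flush after every result
        max_retries=3,    # per-backend attempts; backoff doubles from retry_delay
        retry_delay=1.0,
    )
    try:
        # A hypothetical per-task result; the module only assumes a dict,
        # with an optional "instance_id" key used by the S3 backend.
        await streamer.send({"instance_id": "task-001", "resolved": True})
    finally:
        await streamer.close()  # flushes the buffer and closes all backends


asyncio.run(main())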
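
Because StreamBackend is declared as a runtime-checkable Protocol rather than an abstract base class, a custom backend needs no inheritance, just the three async methods. A hypothetical in-memory backend (e.g. for tests) illustrates the contract:

from mcpbr.result_streaming import ResultStreamer, StreamBackend


class MemoryStream:
    """Hypothetical backend that collects results in a list."""

    def __init__(self) -> None:
        self.results: list[dict] = []

    async def send(self, result: dict) -> bool:
        # Per the protocol: report success via the return value, never raise.
        self.results.append(result)
        return True

    async def flush(self) -> None:
        """Nothing is buffered, so flush is a no-op."""

    async def close(self) -> None:
        """No resources to release."""


backend = MemoryStream()
assert isinstance(backend, StreamBackend)  # satisfied structurally
streamer = ResultStreamer(backends=[backend])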