mcpbr 0.4.16__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. mcpbr/__init__.py +20 -1
  2. mcpbr/config.py +37 -1
  3. mcpbr/config_migration.py +470 -0
  4. mcpbr/config_wizard.py +647 -0
  5. mcpbr/dashboard.py +619 -0
  6. mcpbr/dataset_streaming.py +491 -0
  7. mcpbr/docker_cache.py +539 -0
  8. mcpbr/docker_env.py +2 -1
  9. mcpbr/docker_prewarm.py +370 -0
  10. mcpbr/dry_run.py +533 -0
  11. mcpbr/formatting.py +444 -0
  12. mcpbr/gpu_support.py +2 -1
  13. mcpbr/graceful_degradation.py +277 -0
  14. mcpbr/harness.py +38 -4
  15. mcpbr/languages.py +228 -0
  16. mcpbr/logging_config.py +207 -0
  17. mcpbr/models.py +66 -0
  18. mcpbr/preflight.py +2 -1
  19. mcpbr/pricing.py +72 -0
  20. mcpbr/providers.py +316 -3
  21. mcpbr/resource_limits.py +487 -0
  22. mcpbr/result_streaming.py +519 -0
  23. mcpbr/sdk.py +264 -0
  24. mcpbr/smoke_test.py +2 -1
  25. mcpbr/task_batching.py +403 -0
  26. mcpbr/task_scheduler.py +468 -0
  27. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/METADATA +8 -1
  28. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/RECORD +38 -22
  29. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  30. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  31. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  32. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  33. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  34. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  35. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  36. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/WHEEL +0 -0
  37. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/entry_points.txt +0 -0
  38. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,468 @@
1
+ """Task prioritization and scheduling for benchmark evaluations.
2
+
3
+ Provides intelligent task ordering strategies to optimize benchmark runs
4
+ for speed, cost, coverage diversity, or custom scoring functions. Tasks
5
+ can be reordered before execution to get faster feedback, reduce costs,
6
+ or ensure diverse coverage across repositories and categories.
7
+
8
+ Addresses GitHub issue #92: Task Prioritization and Scheduling.
9
+ """
10
+
11
+ from collections import defaultdict
12
+ from collections.abc import Callable
13
+ from dataclasses import dataclass, field
14
+ from enum import Enum
15
+ from typing import Any
16
+
17
+
18
class SchedulingStrategy(Enum):
    """Enumerates the ways a task list can be ordered before a benchmark run.

    Each member's value is the short string used to name the strategy
    (for example in presets and in schedule previews).

    Attributes:
        DEFAULT: Leave the tasks in the order they were given.
        SPEED_FIRST: Put the tasks with the lowest estimated run time first.
        COST_FIRST: Put the tasks with the lowest estimated cost first.
        COVERAGE_FIRST: Interleave categories/repos round-robin so early
            results are diverse.
        CUSTOM: Order by a caller-supplied scoring function.
    """

    # No reordering at all.
    DEFAULT = "default"
    # Quick feedback: fastest tasks first.
    SPEED_FIRST = "speed"
    # Low early spend: cheapest tasks first.
    COST_FIRST = "cost"
    # Diverse early results: one task per category in turn.
    COVERAGE_FIRST = "coverage"
    # User-provided scoring function decides the order.
    CUSTOM = "custom"
34
+
35
+
36
+ @dataclass
37
+ class TaskPriority:
38
+ """Priority metadata for a single benchmark task.
39
+
40
+ Attributes:
41
+ task_id: Unique identifier for the task (e.g., instance_id).
42
+ priority_score: Computed priority score (lower = higher priority / runs first).
43
+ estimated_time_seconds: Rough estimate of task execution time in seconds.
44
+ estimated_cost_usd: Rough estimate of task cost in USD.
45
+ category: Category or grouping key (e.g., repo name, difficulty level).
46
+ metadata: Additional metadata associated with the task.
47
+ """
48
+
49
+ task_id: str
50
+ priority_score: float = 0.0
51
+ estimated_time_seconds: float | None = None
52
+ estimated_cost_usd: float | None = None
53
+ category: str | None = None
54
+ metadata: dict[str, Any] = field(default_factory=dict)
55
+
56
+
57
# Default characters-per-token ratio for cost estimation (characters are
# divided by this value to approximate a token count).
# Based on the rule of thumb that ~4 characters is roughly 1 token for English text.
_DEFAULT_CHARS_PER_TOKEN = 4

# Default assumed output-to-input token ratio for agent tasks: estimated
# output tokens are this multiple of the estimated input tokens.
# Original author's note: agents typically produce 3-5x more output than
# input for coding tasks.
_DEFAULT_OUTPUT_INPUT_RATIO = 4.0

# Baseline seconds per 1000 characters of problem statement.
# Longer problems tend to require more exploration and tool calls.
_DEFAULT_SECONDS_PER_KCHAR = 30.0

# Minimum estimated time for any task (seconds); time estimates are
# clamped to at least this value.
_MIN_ESTIMATED_TIME = 30.0

# Minimum estimated cost for any task (USD); cost estimates are clamped
# to at least this value.
_MIN_ESTIMATED_COST = 0.001
74
+
75
+
76
class TaskScheduler:
    """Scheduler that reorders benchmark tasks based on a chosen strategy.

    The scheduler assigns a priority score to every task and returns the
    tasks sorted by that score (ascending, stable for ties). It supports
    preset strategies (speed, cost, coverage) and custom scoring functions.

    Scores are computed once per call and each score stays attached to the
    exact task object it was computed for, so tasks that share an
    ``instance_id`` (or have none) are never dropped or duplicated.

    Args:
        strategy: The scheduling strategy to use.
        custom_scorer: A callable that takes a task dict and returns a float
            priority score (lower = runs first). Required when strategy is CUSTOM.

    Raises:
        ValueError: If strategy is CUSTOM but no custom_scorer is provided.

    Example:
        >>> scheduler = TaskScheduler(strategy=SchedulingStrategy.SPEED_FIRST)
        >>> ordered = scheduler.schedule(tasks)
        >>> print(scheduler.preview(tasks))
    """

    def __init__(
        self,
        strategy: SchedulingStrategy = SchedulingStrategy.DEFAULT,
        custom_scorer: Callable[[dict[str, Any]], float] | None = None,
    ) -> None:
        if strategy == SchedulingStrategy.CUSTOM and custom_scorer is None:
            raise ValueError(
                "custom_scorer is required when strategy is CUSTOM. "
                "Provide a callable that takes a task dict and returns a float score."
            )
        self._strategy = strategy
        self._custom_scorer = custom_scorer

    @property
    def strategy(self) -> SchedulingStrategy:
        """The active scheduling strategy."""
        return self._strategy

    def schedule(self, tasks: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Reorder tasks according to the active scheduling strategy.

        Args:
            tasks: List of task dictionaries to schedule. Each task should have
                at least an ``instance_id`` key. Additional keys like
                ``problem_statement``, ``repo``, and ``category`` improve
                estimation accuracy.

        Returns:
            A new list containing every input task exactly once, in the
            scheduled order. The original list is not mutated.
        """
        if not tasks:
            return []

        if self._strategy == SchedulingStrategy.DEFAULT:
            return list(tasks)

        return [task for _, task in self._ranked_pairs(tasks)]

    def preview(self, tasks: list[dict[str, Any]]) -> str:
        """Generate a human-readable preview of the scheduled task order.

        Args:
            tasks: List of task dictionaries to preview.

        Returns:
            A formatted string showing the scheduled order with priority
            details, suitable for display before execution.
        """
        if not tasks:
            return "No tasks to schedule."

        # Score once: the rows below show exactly the scores that produced
        # the order (a custom scorer is not invoked a second time).
        ranked = self._ranked_pairs(tasks)
        rule = "-" * 70

        lines: list[str] = [
            f"Schedule Preview (strategy: {self._strategy.value})",
            f"Total tasks: {len(ranked)}",
            rule,
            f"{'#':<4} {'Task ID':<35} {'Score':<8} {'Est. Time':<12} {'Est. Cost':<10}",
            rule,
        ]

        for position, (priority, _task) in enumerate(ranked, start=1):
            # str() keeps the alignment spec valid for non-string ids (e.g. None).
            tid = str(priority.task_id)
            score_str = f"{priority.priority_score:.2f}"
            time_str = (
                f"{priority.estimated_time_seconds:.0f}s"
                if priority.estimated_time_seconds is not None
                else "N/A"
            )
            cost_str = (
                f"${priority.estimated_cost_usd:.4f}"
                if priority.estimated_cost_usd is not None
                else "N/A"
            )
            lines.append(f"{position:<4} {tid:<35} {score_str:<8} {time_str:<12} {cost_str:<10}")

        lines.append(rule)
        return "\n".join(lines)

    def estimate_task_cost(self, task: dict[str, Any], model: str = "sonnet") -> float:
        """Estimate the cost of running a single task in USD.

        The estimate is based on the length of the problem statement and
        the model's pricing. Longer problems produce more tokens and cost more.

        Args:
            task: Task dictionary, ideally containing a ``problem_statement`` key.
            model: Model identifier used for pricing lookup (default: ``"sonnet"``).

        Returns:
            Estimated cost in USD, never below ``_MIN_ESTIMATED_COST``.
            ``_MIN_ESTIMATED_COST`` itself is returned when no pricing data
            is available for the model.
        """
        # Imported lazily, as in the original implementation.
        from .pricing import get_model_pricing

        pricing = get_model_pricing(model)
        if pricing is None:
            return _MIN_ESTIMATED_COST

        # Input tokens from the problem length, with a floor of 100 tokens.
        input_tokens = max(self._problem_length(task) / _DEFAULT_CHARS_PER_TOKEN, 100)
        # Output tokens are assumed to be a fixed multiple of the input.
        output_tokens = input_tokens * _DEFAULT_OUTPUT_INPUT_RATIO

        # Prices are quoted per million tokens.
        input_cost = (input_tokens / 1_000_000) * pricing.input_price_per_mtok
        output_cost = (output_tokens / 1_000_000) * pricing.output_price_per_mtok

        return max(input_cost + output_cost, _MIN_ESTIMATED_COST)

    def estimate_task_time(self, task: dict[str, Any]) -> float:
        """Estimate the execution time for a single task in seconds.

        The estimate scales linearly with the length of the problem statement.

        Args:
            task: Task dictionary, ideally containing a ``problem_statement`` key.

        Returns:
            Estimated execution time in seconds (minimum ``_MIN_ESTIMATED_TIME``).
        """
        estimated = (self._problem_length(task) / 1000) * _DEFAULT_SECONDS_PER_KCHAR
        return max(estimated, _MIN_ESTIMATED_TIME)

    # ------------------------------------------------------------------
    # Private API kept for compatibility: each returns TaskPriority objects
    # in the same order as before.
    # ------------------------------------------------------------------

    def _compute_priorities(self, tasks: list[dict[str, Any]]) -> list[TaskPriority]:
        """Compute priority scores for all tasks based on the active strategy.

        Args:
            tasks: List of task dictionaries.

        Returns:
            List of TaskPriority objects with computed scores.
        """
        return [priority for priority, _ in self._score_pairs(tasks, self._strategy)]

    def _prioritize_by_speed(self, tasks: list[dict[str, Any]]) -> list[TaskPriority]:
        """Score each task by its estimated execution time (input order kept)."""
        pairs = self._score_pairs(tasks, SchedulingStrategy.SPEED_FIRST)
        return [priority for priority, _ in pairs]

    def _prioritize_by_cost(self, tasks: list[dict[str, Any]]) -> list[TaskPriority]:
        """Score each task by its estimated cost (input order kept)."""
        pairs = self._score_pairs(tasks, SchedulingStrategy.COST_FIRST)
        return [priority for priority, _ in pairs]

    def _prioritize_by_coverage(self, tasks: list[dict[str, Any]]) -> list[TaskPriority]:
        """Score tasks round-robin across categories (returned in interleaved order)."""
        pairs = self._score_pairs(tasks, SchedulingStrategy.COVERAGE_FIRST)
        return [priority for priority, _ in pairs]

    def _prioritize_by_custom(self, tasks: list[dict[str, Any]]) -> list[TaskPriority]:
        """Score each task with the user-provided scorer (input order kept).

        Raises:
            RuntimeError: If custom_scorer is None (should not happen due to
                __init__ validation).
        """
        pairs = self._score_pairs(tasks, SchedulingStrategy.CUSTOM)
        return [priority for priority, _ in pairs]

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _task_id(task: dict[str, Any]) -> Any:
        """Return the task's ``instance_id``, or a per-object fallback id."""
        return task.get("instance_id", str(id(task)))

    @staticmethod
    def _problem_length(task: dict[str, Any]) -> int:
        """Return the length of ``problem_statement``, or 0 if absent or not a str."""
        problem = task.get("problem_statement", "")
        return len(problem) if isinstance(problem, str) else 0

    def _ranked_pairs(
        self, tasks: list[dict[str, Any]]
    ) -> list[tuple[TaskPriority, dict[str, Any]]]:
        """Return (priority, task) pairs sorted by ascending priority score."""
        pairs = self._score_pairs(tasks, self._strategy)
        # list.sort is stable, so equal scores keep their emission order.
        pairs.sort(key=lambda pair: pair[0].priority_score)
        return pairs

    def _score_pairs(
        self, tasks: list[dict[str, Any]], strategy: SchedulingStrategy
    ) -> list[tuple[TaskPriority, dict[str, Any]]]:
        """Pair every task with a freshly scored TaskPriority for ``strategy``.

        Keeping the task object next to its priority avoids any lookup by
        ``instance_id``, which is what made duplicate ids collapse.
        """
        if strategy == SchedulingStrategy.SPEED_FIRST:
            return self._estimate_pairs(tasks, lambda p, _t: p.estimated_time_seconds)
        if strategy == SchedulingStrategy.COST_FIRST:
            return self._estimate_pairs(tasks, lambda p, _t: p.estimated_cost_usd)
        if strategy == SchedulingStrategy.COVERAGE_FIRST:
            return self._coverage_pairs(tasks)
        if strategy == SchedulingStrategy.CUSTOM:
            scorer = self._custom_scorer
            if scorer is None:
                raise RuntimeError("custom_scorer is None but strategy is CUSTOM")
            return self._estimate_pairs(tasks, lambda _p, t: scorer(t))

        # DEFAULT: preserve original order via index-based scoring.
        return [
            (TaskPriority(task_id=self._task_id(task), priority_score=float(i)), task)
            for i, task in enumerate(tasks)
        ]

    def _estimate_pairs(
        self,
        tasks: list[dict[str, Any]],
        score_of: Callable[[TaskPriority, dict[str, Any]], float],
    ) -> list[tuple[TaskPriority, dict[str, Any]]]:
        """Build estimate-carrying priorities, scoring each via ``score_of``."""
        pairs: list[tuple[TaskPriority, dict[str, Any]]] = []
        for task in tasks:
            priority = TaskPriority(
                task_id=self._task_id(task),
                estimated_time_seconds=self.estimate_task_time(task),
                estimated_cost_usd=self.estimate_task_cost(task),
                category=_extract_category(task),
            )
            priority.priority_score = score_of(priority, task)
            pairs.append((priority, task))
        return pairs

    def _coverage_pairs(
        self, tasks: list[dict[str, Any]]
    ) -> list[tuple[TaskPriority, dict[str, Any]]]:
        """Interleave tasks round-robin across categories.

        Tasks are grouped by category (falling back to ``"_uncategorized_"``),
        categories are visited in sorted order for determinism, and one task
        per category is taken in each round. Scores are the 0-based position
        in that interleaved sequence.
        """
        groups: dict[str, list[dict[str, Any]]] = defaultdict(list)
        for task in tasks:
            groups[_extract_category(task) or "_uncategorized_"].append(task)

        categories = sorted(groups)
        rounds = max((len(members) for members in groups.values()), default=0)

        pairs: list[tuple[TaskPriority, dict[str, Any]]] = []
        for round_idx in range(rounds):
            for category in categories:
                members = groups[category]
                if round_idx >= len(members):
                    continue  # this category is already exhausted
                task = members[round_idx]
                priority = TaskPriority(
                    task_id=self._task_id(task),
                    priority_score=float(len(pairs)),
                    estimated_time_seconds=self.estimate_task_time(task),
                    estimated_cost_usd=self.estimate_task_cost(task),
                    category=category,
                )
                pairs.append((priority, task))
        return pairs
418
+
419
+
420
+ def _extract_category(task: dict[str, Any]) -> str | None:
421
+ """Extract a category label from a task dictionary.
422
+
423
+ Checks common fields in order of preference: ``category``, ``repo``,
424
+ ``difficulty``. Returns the first non-empty string found, or None.
425
+
426
+ Args:
427
+ task: Task dictionary.
428
+
429
+ Returns:
430
+ Category string, or None if no category field is found.
431
+ """
432
+ for key in ("category", "repo", "difficulty"):
433
+ value = task.get(key)
434
+ if value is not None:
435
+ return str(value)
436
+ return None
437
+
438
+
439
def create_scheduler(preset: str, **kwargs: Any) -> TaskScheduler:
    """Create a TaskScheduler from a preset name.

    Convenience factory function that maps human-readable preset names to
    scheduling strategies. Preset names are matched case-insensitively.

    Args:
        preset: One of ``"default"``, ``"speed"``, ``"cost"``, ``"coverage"``,
            or ``"custom"``.
        **kwargs: Additional keyword arguments passed to TaskScheduler.
            ``custom_scorer`` is required when the preset is ``"custom"``.

    Returns:
        Configured TaskScheduler instance.

    Raises:
        ValueError: If the preset name is not recognized, or if the
            ``"custom"`` preset is requested without a ``custom_scorer``
            (raised by TaskScheduler).
    """
    preset_map: dict[str, SchedulingStrategy] = {
        "default": SchedulingStrategy.DEFAULT,
        "speed": SchedulingStrategy.SPEED_FIRST,
        "cost": SchedulingStrategy.COST_FIRST,
        "coverage": SchedulingStrategy.COVERAGE_FIRST,
        # Without this entry the advertised custom_scorer kwarg could never
        # take effect, because no preset selected the CUSTOM strategy.
        "custom": SchedulingStrategy.CUSTOM,
    }

    strategy = preset_map.get(preset.lower())
    if strategy is None:
        valid_presets = ", ".join(sorted(preset_map))
        raise ValueError(f"Unknown scheduling preset: '{preset}'. Valid presets: {valid_presets}")

    return TaskScheduler(strategy=strategy, **kwargs)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mcpbr
3
- Version: 0.4.16
3
+ Version: 0.6.0
4
4
  Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
5
5
  Project-URL: Homepage, https://github.com/greynewell/mcpbr
6
6
  Project-URL: Repository, https://github.com/greynewell/mcpbr
@@ -30,6 +30,9 @@ Requires-Dist: pydantic>=2.0.0
30
30
  Requires-Dist: pyyaml>=6.0.0
31
31
  Requires-Dist: requests>=2.31.0
32
32
  Requires-Dist: rich>=13.0.0
33
+ Provides-Extra: all-providers
34
+ Requires-Dist: google-generativeai>=0.3.0; extra == 'all-providers'
35
+ Requires-Dist: openai>=1.0.0; extra == 'all-providers'
33
36
  Provides-Extra: dev
34
37
  Requires-Dist: pre-commit>=3.0.0; extra == 'dev'
35
38
  Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
@@ -40,6 +43,10 @@ Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
40
43
  Requires-Dist: mkdocs-minify-plugin>=0.7.0; extra == 'docs'
41
44
  Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
42
45
  Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
46
+ Provides-Extra: gemini
47
+ Requires-Dist: google-generativeai>=0.3.0; extra == 'gemini'
48
+ Provides-Extra: openai
49
+ Requires-Dist: openai>=1.0.0; extra == 'openai'
43
50
  Description-Content-Type: text/markdown
44
51
 
45
52
  # mcpbr
@@ -1,40 +1,56 @@
1
- mcpbr/__init__.py,sha256=3vhpKV9kVECjuPapKpCPEHTjlOsyhuoiLZxBv9O1eL0,152
1
+ mcpbr/__init__.py,sha256=uTZA0md1SKeI-aFAFCGdnfYD8NVOI70BQhxNN_EsyxA,447
2
2
  mcpbr/__main__.py,sha256=WmeQsAqtW_9tMTNKArH1m76DPBokZpXuy6dMZp13gXA,132
3
3
  mcpbr/agent.py,sha256=aSFH2S3ExKZfdVfMbzk6D1nRhpKt4JmpRzmF4Vi6Gmo,5795
4
4
  mcpbr/cache.py,sha256=YiP13omwMbXLb6NhNocJvL58enXEx9J8OrvTZnWUkw4,13254
5
5
  mcpbr/cli.py,sha256=xvh7gpJx0LzjV3g-Te4FF7BfHubGzDxOiYQsSeQnCEc,68276
6
- mcpbr/config.py,sha256=7lWV0ZtzyD6WZ07IR4yhT9lyBBPONzlanaO4XHm9OoE,18952
6
+ mcpbr/config.py,sha256=KEuOzo-hhGwbejljlUpQn7gWC42Y5doe07hqeLwQIYA,20171
7
7
  mcpbr/config_inheritance.py,sha256=0EV9Tv62UFNgZoc8mY7yYjHEbnMM_R5EAhSeuK7ajAA,6617
8
+ mcpbr/config_migration.py,sha256=vTs52uYLO0DusB07nHZT2Y27-_eZdZKhaXYWhDFcnJI,16098
8
9
  mcpbr/config_validator.py,sha256=ZMEIeK4y6fSwyY46Xv5dK5v3jM4HDKcYkosnIcn7iyI,20488
10
+ mcpbr/config_wizard.py,sha256=IPk5a2xI_6CTvo9tIDe2MLGewbc_iVYg3k_NOTC6A6c,22522
9
11
  mcpbr/custom_metrics.py,sha256=4pMO9-BPpeQ_GUTnZ18TQXINFScAMH3cIYm0HG-C51o,13213
12
+ mcpbr/dashboard.py,sha256=wt2A-yFgDvQc94wgPPJlz70gFAkyUi41xgfqPL9xRQY,21884
13
+ mcpbr/dataset_streaming.py,sha256=XwQSdvy97yurlcAC5hUwto8bLuCf2A9FSMcwjTD_Tho,16720
10
14
  mcpbr/dataset_versioning.py,sha256=Y_ZSGhl8ihl6Kgee_p7VbkNwGhgwIdMZPlRunvk4knY,7149
11
- mcpbr/docker_env.py,sha256=_45OUZKjUevE9O3YLF_1uvQtdOyJ7yZIYWmSvXN3cFw,31794
15
+ mcpbr/docker_cache.py,sha256=jn_9Ak2d8omNmedSCBwA7wrswtEQvB-Bu8TIP2cm-F0,18704
16
+ mcpbr/docker_env.py,sha256=EQmVfQNyvWmmMErRJeEpsvJp1dNiLyIG2EDAmjZVMTI,31795
17
+ mcpbr/docker_prewarm.py,sha256=DCB5-h-Uu7KjsfMCGsZOTpTjLhshMQ8QfpbQh-T1PxU,12639
18
+ mcpbr/dry_run.py,sha256=rlohv2QjtaFvLeJ7JOAPIH95GGOFoenFLcmBk-xX8x4,18219
12
19
  mcpbr/env_expansion.py,sha256=Rkhth-tWV8CptQlSSk9exuMsUaSTTW9hj69z4snZd_U,6122
13
20
  mcpbr/evaluation.py,sha256=EjPREWv7hBRqhBhNan0ERh2imqMBegT0Y2cgZlTxRGk,12765
14
21
  mcpbr/failure_analysis.py,sha256=N5xp9YPe2d7P9fTa2LVSHsPgB1WOQtWMeClq3bOv4_c,19883
15
22
  mcpbr/few_shot.py,sha256=bFDdes_kgZAFWoFZQEfZG5Z2Es9rmkB1jsxSMp4aCCM,11684
16
- mcpbr/gpu_support.py,sha256=eroBiLkt1A3Q2ODJDSyqrd_BzcMh8tFkjtPn7PsvJJc,5070
17
- mcpbr/harness.py,sha256=8-qmcPR2CDFuoBib9g6lPx7aMOK-5PuZgpWhpGs-Ils,51419
23
+ mcpbr/formatting.py,sha256=lwZcb4fD5osBzJlerICyvAVb4KHSm_nRTBg1dVfD6Lo,14193
24
+ mcpbr/gpu_support.py,sha256=6f_t_bGoADnu0uPsGDwSWYD7xHN_e553x1J6ceQUHi4,5071
25
+ mcpbr/graceful_degradation.py,sha256=QPFQ0BCLERvH4e68rQkePHUW9BDJmXttlIQaqCOP5gw,8992
26
+ mcpbr/harness.py,sha256=Rc6CqzZOMJyuHqfuOIDisLOoPka-cqAqYiL7zr7ALFg,53193
18
27
  mcpbr/harnesses.py,sha256=h9iDp4qkPABNwO9OXbJ61qcD4n0oAUTU7AQksxRKLcg,47335
19
28
  mcpbr/incremental_save.py,sha256=1dm3pGiEIhP8cVk_Y6XF_cAdo3B_vyRc6CO8Wt-MyIA,4830
20
29
  mcpbr/junit_reporter.py,sha256=M_02zJbFbA3VoIYG5oR7VDecqWHEpIee-JOUShWNuLU,9261
30
+ mcpbr/languages.py,sha256=LrWjEsaH9nl37LZtRxPwC8J3zETMbbcxRI16gjMx2iw,8087
21
31
  mcpbr/latency_metrics.py,sha256=xNMaUzGMSbOIfuoyZGyIfyMk5uAmoj6K65ZAs5D6Z8c,10476
22
32
  mcpbr/log_formatter.py,sha256=d2jWH7z4IRSbr8-PbnEt3TmLAqk8vgdPT38uTnTCN5c,21488
23
- mcpbr/models.py,sha256=zsrBrwFeOfNKgThUbT1oPkF5pdRjL1QJjMte0vXjcbk,3710
33
+ mcpbr/logging_config.py,sha256=0Q5qNhT_Pz1mYwj1rRENuDlIdiHBBNN7N-cSWAg5B6M,6823
34
+ mcpbr/models.py,sha256=epRVhs1ryGO74ehDRppArUfeOTxnHYyctEo0fD8Dwvw,5698
24
35
  mcpbr/output_validator.py,sha256=TUoBtDjjXvR6MACbWV6uNOsxM_n4C0Jbn5in35HH4K8,1750
25
- mcpbr/preflight.py,sha256=UE185nnXqXP1hfVDia0sPHfR2pMXohBkbAPuZfDG_Z0,8914
26
- mcpbr/pricing.py,sha256=WFnVS3L8z9PSEXuEkGWFdeSJvmnUKqaQGE6-AgUEzdA,7214
36
+ mcpbr/preflight.py,sha256=owWPOCEIN8vSY1Z2NB1T9ow7UzY6O6mLh5KPK1ZxEGo,8915
37
+ mcpbr/pricing.py,sha256=L-HE-C8LWCQ2iWEqWHkBtyLzPPBdr05bVaCmEy9EQsw,9693
27
38
  mcpbr/profiler.py,sha256=SRXLKf2TOlpnMbQpGvjRy1Agv-XaEz6lDmBa5WGNv8c,15954
28
- mcpbr/providers.py,sha256=ebrnH6RXODxX4Ma9r7Is5VBHYFNP5LwCs-vpLbbHP8o,6598
39
+ mcpbr/providers.py,sha256=p8gsPbcM9HZjSFztCbBUBvbWqc0hA_lprNh-vtDwDiE,16721
29
40
  mcpbr/regression.py,sha256=xm_ago8ZP3RAOrDNjtINwyRUvzKWJcJDWbzf3hp6LlU,12827
30
41
  mcpbr/reporting.py,sha256=Odzb7EgpimW-qh01VQedhb2X594ACrOcGe4jshgiwTg,56111
42
+ mcpbr/resource_limits.py,sha256=MnmKcJLPCD-cyydGyiZz1TOPsQSpmdsmrbAvYWlxM5s,16594
43
+ mcpbr/result_streaming.py,sha256=uXKCPNSXhBd8VV-Sc5aldC7zSM1MG8pAiMgsMP7ZZOA,16602
31
44
  mcpbr/sampling.py,sha256=Hpgh2TayI3QGcno-Np9eYi8sklxKEZQXyhpaQlc9T4Q,6248
32
45
  mcpbr/schema.py,sha256=fdjiKmp1au2oN5aXcPRoCbyvwm2XeMD5DmeWSurMk4A,6858
33
- mcpbr/smoke_test.py,sha256=srYGOn_auspRbt_a6ebYDDDq_nujA_iZGman5nU1ikU,14925
46
+ mcpbr/sdk.py,sha256=uxX7lHzR0SkZe3lmTT7zi_C0nPPQY5wGtN1USIGf-8M,8772
47
+ mcpbr/smoke_test.py,sha256=SCWR26kVmHZj-smWziwMh24UIO_MtIaik-REuHZ0pbc,14926
34
48
  mcpbr/state_tracker.py,sha256=rIP9LIHtQg6oBsLIxnwRjE865Kw6U7DMO_GzzuMRC0E,10790
35
49
  mcpbr/statistics.py,sha256=Ny8TMdBrIpS4KfKCJcuFfTeaGuTmEkS1G_uHBlboYdA,19134
36
50
  mcpbr/streaming.py,sha256=XPhkXO1R1EsWtkoPvCpyy4TehEom7hkuOeP-00joX3o,13853
37
51
  mcpbr/swebench_test_specs.py,sha256=Mh_BPjcexkgDT3p4zT2p31925b8w5tgsxxRpYZQZalM,1390
52
+ mcpbr/task_batching.py,sha256=KFw_RxZUvVgjRr3pc0ehBX2GAoPT5yTP4VdM43e1_Dk,15275
53
+ mcpbr/task_scheduler.py,sha256=HBR8ebJJ-ZDzM2nnWaIHhOqdbdqRjYzbEVGYjGiVYEw,17183
38
54
  mcpbr/templates.py,sha256=dqwboVB-yfE06w2rgDOvuWJB4Hx5duH_W-jvLBqmlKg,10683
39
55
  mcpbr/benchmarks/__init__.py,sha256=2-7Ebg6-wHo1QGfVKWjjbREcLG_A-6Q0XfZGiyXrOeE,4489
40
56
  mcpbr/benchmarks/adversarial.py,sha256=69VBTZv6BhR1JwjQepA_YwAu3b--vJviGd6IWs2h1QA,12357
@@ -80,15 +96,15 @@ mcpbr/infrastructure/azure_health.py,sha256=xITmIa9IfYIwxcVhY0sJ81a-6WNKiT8kSQTd
80
96
  mcpbr/infrastructure/base.py,sha256=Olj6uiNBeGoUqltZI1NHZfa26kzT-6jfp8YIXSykFKM,3037
81
97
  mcpbr/infrastructure/local.py,sha256=VK6UAg7Dzvb9v1LAJgNGA_s0blQKrHAQEXBAC75zAL8,4237
82
98
  mcpbr/infrastructure/manager.py,sha256=j0T7U1Tbajmfve4SNfhYKikvL9kgSVT01fYKMC-sH-s,4796
83
- mcpbr-0.4.16.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
84
- mcpbr-0.4.16.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
85
- mcpbr-0.4.16.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
86
- mcpbr-0.4.16.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
87
- mcpbr-0.4.16.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
88
- mcpbr-0.4.16.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
89
- mcpbr-0.4.16.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
90
- mcpbr-0.4.16.dist-info/METADATA,sha256=GeSnMZw0x7-XPhblIu50aCO7NXaNfjgVScnBOp6ZaOA,55069
91
- mcpbr-0.4.16.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
92
- mcpbr-0.4.16.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
93
- mcpbr-0.4.16.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
94
- mcpbr-0.4.16.dist-info/RECORD,,
99
+ mcpbr-0.6.0.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
100
+ mcpbr-0.6.0.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
101
+ mcpbr-0.6.0.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
102
+ mcpbr-0.6.0.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
103
+ mcpbr-0.6.0.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
104
+ mcpbr-0.6.0.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
105
+ mcpbr-0.6.0.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
106
+ mcpbr-0.6.0.dist-info/METADATA,sha256=sNd6FYrOQ1tgz6bum5D03w373dEJfXODVLfmLsiy5hk,55376
107
+ mcpbr-0.6.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
108
+ mcpbr-0.6.0.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
109
+ mcpbr-0.6.0.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
110
+ mcpbr-0.6.0.dist-info/RECORD,,
File without changes