mcpbr 0.4.16__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcpbr/__init__.py +20 -1
- mcpbr/config.py +37 -1
- mcpbr/config_migration.py +470 -0
- mcpbr/config_wizard.py +647 -0
- mcpbr/dashboard.py +619 -0
- mcpbr/dataset_streaming.py +491 -0
- mcpbr/docker_cache.py +539 -0
- mcpbr/docker_env.py +2 -1
- mcpbr/docker_prewarm.py +370 -0
- mcpbr/dry_run.py +533 -0
- mcpbr/formatting.py +444 -0
- mcpbr/gpu_support.py +2 -1
- mcpbr/graceful_degradation.py +277 -0
- mcpbr/harness.py +38 -4
- mcpbr/languages.py +228 -0
- mcpbr/logging_config.py +207 -0
- mcpbr/models.py +66 -0
- mcpbr/preflight.py +2 -1
- mcpbr/pricing.py +72 -0
- mcpbr/providers.py +316 -3
- mcpbr/resource_limits.py +487 -0
- mcpbr/result_streaming.py +519 -0
- mcpbr/sdk.py +264 -0
- mcpbr/smoke_test.py +2 -1
- mcpbr/task_batching.py +403 -0
- mcpbr/task_scheduler.py +468 -0
- {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/METADATA +8 -1
- {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/RECORD +38 -22
- {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/WHEEL +0 -0
- {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/entry_points.txt +0 -0
- {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/task_scheduler.py
ADDED
|
@@ -0,0 +1,468 @@
|
|
|
1
|
+
"""Task prioritization and scheduling for benchmark evaluations.
|
|
2
|
+
|
|
3
|
+
Provides intelligent task ordering strategies to optimize benchmark runs
|
|
4
|
+
for speed, cost, coverage diversity, or custom scoring functions. Tasks
|
|
5
|
+
can be reordered before execution to get faster feedback, reduce costs,
|
|
6
|
+
or ensure diverse coverage across repositories and categories.
|
|
7
|
+
|
|
8
|
+
Addresses GitHub issue #92: Task Prioritization and Scheduling.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from collections import defaultdict
|
|
12
|
+
from collections.abc import Callable
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from enum import Enum
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SchedulingStrategy(Enum):
    """Available policies for ordering benchmark tasks ahead of a run.

    Each member's value is the short, lowercase name of the policy.

    Members:
        DEFAULT: Leave the tasks in the order they were supplied.
        SPEED_FIRST: Put the tasks estimated to finish soonest at the front.
        COST_FIRST: Put the tasks estimated to be cheapest at the front.
        COVERAGE_FIRST: Interleave tasks round-robin across categories/repos
            so early results are diverse.
        CUSTOM: Rank tasks with a scoring function supplied by the caller.
    """

    # Keep the caller's ordering untouched.
    DEFAULT = "default"
    # Shortest estimated runtime first.
    SPEED_FIRST = "speed"
    # Lowest estimated spend first.
    COST_FIRST = "cost"
    # Round-robin over categories.
    COVERAGE_FIRST = "coverage"
    # Caller-provided scorer decides.
    CUSTOM = "custom"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
|
|
37
|
+
class TaskPriority:
|
|
38
|
+
"""Priority metadata for a single benchmark task.
|
|
39
|
+
|
|
40
|
+
Attributes:
|
|
41
|
+
task_id: Unique identifier for the task (e.g., instance_id).
|
|
42
|
+
priority_score: Computed priority score (lower = higher priority / runs first).
|
|
43
|
+
estimated_time_seconds: Rough estimate of task execution time in seconds.
|
|
44
|
+
estimated_cost_usd: Rough estimate of task cost in USD.
|
|
45
|
+
category: Category or grouping key (e.g., repo name, difficulty level).
|
|
46
|
+
metadata: Additional metadata associated with the task.
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
task_id: str
|
|
50
|
+
priority_score: float = 0.0
|
|
51
|
+
estimated_time_seconds: float | None = None
|
|
52
|
+
estimated_cost_usd: float | None = None
|
|
53
|
+
category: str | None = None
|
|
54
|
+
metadata: dict[str, Any] = field(default_factory=dict)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Default token-per-character ratio for cost estimation.
# Based on empirical observation that ~4 characters is roughly 1 token for English text.
# The problem statement's character count is divided by this value to
# approximate the number of input tokens.
_DEFAULT_CHARS_PER_TOKEN = 4

# Default assumed output-to-input token ratio for agent tasks.
# Agents typically produce 3-5x more output than input for coding tasks.
# Estimated output tokens are the estimated input tokens times this factor.
_DEFAULT_OUTPUT_INPUT_RATIO = 4.0

# Baseline seconds per 1000 characters of problem statement.
# Longer problems tend to require more exploration and tool calls.
# The time estimate scales linearly with statement length using this rate.
_DEFAULT_SECONDS_PER_KCHAR = 30.0

# Minimum estimated time for any task (seconds).
# Acts as a floor so short or missing problem statements never estimate below it.
_MIN_ESTIMATED_TIME = 30.0

# Minimum estimated cost for any task (USD).
# Also the value returned when no pricing data is found for the model.
_MIN_ESTIMATED_COST = 0.001
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class TaskScheduler:
    """Scheduler that reorders benchmark tasks based on a chosen strategy.

    Every task is described by a ``TaskPriority`` whose ``priority_score``
    decides its position: lower scores run first, and tasks with equal
    scores keep their input order because the sort is stable.

    Priorities are kept paired with the task they were computed from, so
    tasks that share an ``instance_id`` (or have none) are still returned
    exactly once each.

    Args:
        strategy: The scheduling strategy to use.
        custom_scorer: A callable that takes a task dict and returns a float
            priority score (lower = runs first). Required when strategy is
            CUSTOM; it is not consulted by the other strategies.

    Raises:
        ValueError: If strategy is CUSTOM but no custom_scorer is provided.

    Example:
        >>> scheduler = TaskScheduler(strategy=SchedulingStrategy.SPEED_FIRST)
        >>> ordered = scheduler.schedule(tasks)
        >>> print(scheduler.preview(tasks))
    """

    def __init__(
        self,
        strategy: SchedulingStrategy = SchedulingStrategy.DEFAULT,
        custom_scorer: Callable[[dict[str, Any]], float] | None = None,
    ) -> None:
        if strategy == SchedulingStrategy.CUSTOM and custom_scorer is None:
            raise ValueError(
                "custom_scorer is required when strategy is CUSTOM. "
                "Provide a callable that takes a task dict and returns a float score."
            )
        self._strategy = strategy
        self._custom_scorer = custom_scorer

    @property
    def strategy(self) -> SchedulingStrategy:
        """The active scheduling strategy."""
        return self._strategy

    def schedule(self, tasks: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Reorder tasks according to the active scheduling strategy.

        Args:
            tasks: List of task dictionaries to schedule. Each task should have
                at least an ``instance_id`` key. Additional keys like
                ``problem_statement``, ``repo``, and ``category`` improve
                estimation accuracy.

        Returns:
            A new list containing every input task exactly once, in the
            scheduled order. The original list is not mutated.
        """
        if not tasks:
            return []

        if self._strategy == SchedulingStrategy.DEFAULT:
            # Nothing to compute: a shallow copy preserves the input order.
            return list(tasks)

        return [task for _priority, task in self._rank(tasks)]

    def preview(self, tasks: list[dict[str, Any]]) -> str:
        """Generate a human-readable preview of the scheduled task order.

        Priorities are computed once, so a custom scorer is called a single
        time per task.

        Args:
            tasks: List of task dictionaries to preview.

        Returns:
            A formatted string showing the scheduled order with priority
            details, suitable for display before execution.
        """
        if not tasks:
            return "No tasks to schedule."

        ranked = self._rank(tasks)

        lines: list[str] = [
            f"Schedule Preview (strategy: {self._strategy.value})",
            f"Total tasks: {len(ranked)}",
            "-" * 70,
            f"{'#':<4} {'Task ID':<35} {'Score':<8} {'Est. Time':<12} {'Est. Cost':<10}",
            "-" * 70,
        ]

        for position, (priority, _task) in enumerate(ranked, start=1):
            # str() keeps the row printable when the id is not a string.
            task_label = str(priority.task_id)
            score_str = f"{priority.priority_score:.2f}"
            seconds = priority.estimated_time_seconds
            cost = priority.estimated_cost_usd
            # The DEFAULT strategy carries no estimates, hence the N/A fallbacks.
            time_str = "N/A" if seconds is None else f"{seconds:.0f}s"
            cost_str = "N/A" if cost is None else f"${cost:.4f}"
            lines.append(
                f"{position:<4} {task_label:<35} {score_str:<8} {time_str:<12} {cost_str:<10}"
            )

        lines.append("-" * 70)
        return "\n".join(lines)

    def estimate_task_cost(self, task: dict[str, Any], model: str = "sonnet") -> float:
        """Estimate the cost of running a single task in USD.

        The estimate is based on the length of the problem statement and
        the model's pricing. Longer problems produce more tokens and cost more.

        Args:
            task: Task dictionary, ideally containing a ``problem_statement`` key.
            model: Model identifier used for pricing lookup (default: ``"sonnet"``).

        Returns:
            Estimated cost in USD, never below ``_MIN_ESTIMATED_COST``.
            Returns ``_MIN_ESTIMATED_COST`` when no pricing data is found
            for the model.
        """
        # Imported lazily, as in the original implementation.
        from .pricing import get_model_pricing

        problem = task.get("problem_statement", "")
        # A non-string statement is treated as empty.
        problem_len = len(problem) if isinstance(problem, str) else 0

        pricing = get_model_pricing(model)
        if pricing is None:
            return _MIN_ESTIMATED_COST

        # Estimate input tokens from problem length, with a floor of 100 tokens.
        input_tokens = max(problem_len / _DEFAULT_CHARS_PER_TOKEN, 100)

        # Estimate output tokens as a multiple of input.
        output_tokens = input_tokens * _DEFAULT_OUTPUT_INPUT_RATIO

        # Prices are quoted per million tokens.
        input_cost = (input_tokens / 1_000_000) * pricing.input_price_per_mtok
        output_cost = (output_tokens / 1_000_000) * pricing.output_price_per_mtok

        return max(input_cost + output_cost, _MIN_ESTIMATED_COST)

    def estimate_task_time(self, task: dict[str, Any]) -> float:
        """Estimate the execution time for a single task in seconds.

        The estimate scales linearly with the length of the problem statement.

        Args:
            task: Task dictionary, ideally containing a ``problem_statement`` key.

        Returns:
            Estimated execution time in seconds (minimum ``_MIN_ESTIMATED_TIME``).
        """
        problem = task.get("problem_statement", "")
        # A non-string statement is treated as empty.
        problem_len = len(problem) if isinstance(problem, str) else 0

        estimated = (problem_len / 1000) * _DEFAULT_SECONDS_PER_KCHAR

        return max(estimated, _MIN_ESTIMATED_TIME)

    @staticmethod
    def _task_id(task: dict[str, Any]) -> Any:
        """Return the task's ``instance_id``, or an identity-based fallback.

        A missing or ``None`` id falls back to ``str(id(task))``, which is
        stable for the lifetime of the task object.
        """
        task_id = task.get("instance_id")
        return str(id(task)) if task_id is None else task_id

    def _new_priority(self, task: dict[str, Any], category: str | None) -> TaskPriority:
        """Build a TaskPriority with time/cost estimates and a zero score.

        Callers assign ``priority_score`` afterwards, according to their strategy.
        """
        return TaskPriority(
            task_id=self._task_id(task),
            estimated_time_seconds=self.estimate_task_time(task),
            estimated_cost_usd=self.estimate_task_cost(task),
            category=category,
        )

    def _rank(self, tasks: list[dict[str, Any]]) -> list[tuple[TaskPriority, dict[str, Any]]]:
        """Pair each task with its priority and sort by ascending score.

        Keeping the pair together avoids any lookup by task id, which would
        merge tasks sharing an id.

        Args:
            tasks: List of task dictionaries.

        Returns:
            ``(priority, task)`` pairs, lowest score first; ties keep the
            order in which the pairs were produced.
        """
        if self._strategy == SchedulingStrategy.COVERAGE_FIRST:
            # Coverage priorities come back interleaved, not in input order,
            # so they are paired with their tasks where they are built.
            pairs = self._interleave_by_category(tasks)
        else:
            # All other strategies yield one priority per task, in input order.
            pairs = list(zip(self._compute_priorities(tasks), tasks))

        pairs.sort(key=lambda pair: pair[0].priority_score)
        return pairs

    def _compute_priorities(self, tasks: list[dict[str, Any]]) -> list[TaskPriority]:
        """Compute priority scores for all tasks based on the active strategy.

        Args:
            tasks: List of task dictionaries.

        Returns:
            List of TaskPriority objects with computed scores.
        """
        handlers = {
            SchedulingStrategy.SPEED_FIRST: self._prioritize_by_speed,
            SchedulingStrategy.COST_FIRST: self._prioritize_by_cost,
            SchedulingStrategy.COVERAGE_FIRST: self._prioritize_by_coverage,
            SchedulingStrategy.CUSTOM: self._prioritize_by_custom,
        }
        handler = handlers.get(self._strategy)
        if handler is not None:
            return handler(tasks)

        # DEFAULT: preserve original order via index-based scoring.
        return [
            TaskPriority(task_id=self._task_id(task), priority_score=float(index))
            for index, task in enumerate(tasks)
        ]

    def _prioritize_by_speed(self, tasks: list[dict[str, Any]]) -> list[TaskPriority]:
        """Assign priority scores based on estimated execution time (ascending).

        Args:
            tasks: List of task dictionaries.

        Returns:
            List of TaskPriority objects, in input order, scored by estimated time.
        """
        priorities: list[TaskPriority] = []
        for task in tasks:
            priority = self._new_priority(task, _extract_category(task))
            priority.priority_score = priority.estimated_time_seconds
            priorities.append(priority)
        return priorities

    def _prioritize_by_cost(self, tasks: list[dict[str, Any]]) -> list[TaskPriority]:
        """Assign priority scores based on estimated cost (ascending).

        Args:
            tasks: List of task dictionaries.

        Returns:
            List of TaskPriority objects, in input order, scored by estimated cost.
        """
        priorities: list[TaskPriority] = []
        for task in tasks:
            priority = self._new_priority(task, _extract_category(task))
            priority.priority_score = priority.estimated_cost_usd
            priorities.append(priority)
        return priorities

    def _interleave_by_category(
        self, tasks: list[dict[str, Any]]
    ) -> list[tuple[TaskPriority, dict[str, Any]]]:
        """Round-robin tasks across categories, pairing each with its priority.

        Tasks are grouped by category (tasks without one share
        ``"_uncategorized_"``). Categories are visited in sorted order, one
        task per category per round, and scores count up from 0.0 in that order.

        Args:
            tasks: List of task dictionaries.

        Returns:
            ``(priority, task)`` pairs in interleaved order.
        """
        groups: dict[str, list[dict[str, Any]]] = defaultdict(list)
        for task in tasks:
            groups[_extract_category(task) or "_uncategorized_"].append(task)

        # Sorted keys make the interleaving deterministic.
        ordered_keys = sorted(groups)
        rounds = max((len(bucket) for bucket in groups.values()), default=0)

        pairs: list[tuple[TaskPriority, dict[str, Any]]] = []
        for round_idx in range(rounds):
            for key in ordered_keys:
                bucket = groups[key]
                if round_idx >= len(bucket):
                    # This category has no tasks left for this round.
                    continue
                task = bucket[round_idx]
                priority = self._new_priority(task, key)
                # The score is the task's position in the interleaved sequence.
                priority.priority_score = float(len(pairs))
                pairs.append((priority, task))
        return pairs

    def _prioritize_by_coverage(self, tasks: list[dict[str, Any]]) -> list[TaskPriority]:
        """Assign priority scores using round-robin across categories.

        Tasks are grouped by category (repo, difficulty, or explicit category),
        then interleaved so that early execution covers diverse categories.

        Args:
            tasks: List of task dictionaries.

        Returns:
            List of TaskPriority objects with interleaved category ordering.
        """
        return [priority for priority, _task in self._interleave_by_category(tasks)]

    def _prioritize_by_custom(self, tasks: list[dict[str, Any]]) -> list[TaskPriority]:
        """Assign priority scores using the user-provided custom scorer.

        Args:
            tasks: List of task dictionaries.

        Returns:
            List of TaskPriority objects, in input order, scored by the
            custom function.

        Raises:
            RuntimeError: If custom_scorer is None (should not happen due to
                __init__ validation).
        """
        if self._custom_scorer is None:
            raise RuntimeError("custom_scorer is None but strategy is CUSTOM")

        priorities: list[TaskPriority] = []
        for task in tasks:
            # The scorer runs before the estimates, as in the original ordering.
            score = self._custom_scorer(task)
            priority = self._new_priority(task, _extract_category(task))
            priority.priority_score = score
            priorities.append(priority)
        return priorities
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def _extract_category(task: dict[str, Any]) -> str | None:
|
|
421
|
+
"""Extract a category label from a task dictionary.
|
|
422
|
+
|
|
423
|
+
Checks common fields in order of preference: ``category``, ``repo``,
|
|
424
|
+
``difficulty``. Returns the first non-empty string found, or None.
|
|
425
|
+
|
|
426
|
+
Args:
|
|
427
|
+
task: Task dictionary.
|
|
428
|
+
|
|
429
|
+
Returns:
|
|
430
|
+
Category string, or None if no category field is found.
|
|
431
|
+
"""
|
|
432
|
+
for key in ("category", "repo", "difficulty"):
|
|
433
|
+
value = task.get(key)
|
|
434
|
+
if value is not None:
|
|
435
|
+
return str(value)
|
|
436
|
+
return None
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def create_scheduler(preset: str, **kwargs: Any) -> TaskScheduler:
    """Build a TaskScheduler for a named preset.

    Factory helper that translates a preset name into the matching
    scheduling strategy. The name is matched case-insensitively.

    Args:
        preset: One of ``"default"``, ``"speed"``, ``"cost"``, ``"coverage"``.
        **kwargs: Additional keyword arguments forwarded to TaskScheduler
            (e.g., ``custom_scorer``).

    Returns:
        Configured TaskScheduler instance.

    Raises:
        ValueError: If the preset name is not recognized.
    """
    strategies_by_preset: dict[str, SchedulingStrategy] = {
        "default": SchedulingStrategy.DEFAULT,
        "speed": SchedulingStrategy.SPEED_FIRST,
        "cost": SchedulingStrategy.COST_FIRST,
        "coverage": SchedulingStrategy.COVERAGE_FIRST,
    }

    requested = preset.lower()
    if requested in strategies_by_preset:
        return TaskScheduler(strategy=strategies_by_preset[requested], **kwargs)

    # Unknown name: report it together with the accepted presets.
    valid_presets = ", ".join(sorted(strategies_by_preset))
    raise ValueError(f"Unknown scheduling preset: '{preset}'. Valid presets: {valid_presets}")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mcpbr
|
|
3
|
-
Version: 0.4.16
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
|
|
5
5
|
Project-URL: Homepage, https://github.com/greynewell/mcpbr
|
|
6
6
|
Project-URL: Repository, https://github.com/greynewell/mcpbr
|
|
@@ -30,6 +30,9 @@ Requires-Dist: pydantic>=2.0.0
|
|
|
30
30
|
Requires-Dist: pyyaml>=6.0.0
|
|
31
31
|
Requires-Dist: requests>=2.31.0
|
|
32
32
|
Requires-Dist: rich>=13.0.0
|
|
33
|
+
Provides-Extra: all-providers
|
|
34
|
+
Requires-Dist: google-generativeai>=0.3.0; extra == 'all-providers'
|
|
35
|
+
Requires-Dist: openai>=1.0.0; extra == 'all-providers'
|
|
33
36
|
Provides-Extra: dev
|
|
34
37
|
Requires-Dist: pre-commit>=3.0.0; extra == 'dev'
|
|
35
38
|
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
|
|
@@ -40,6 +43,10 @@ Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
|
|
|
40
43
|
Requires-Dist: mkdocs-minify-plugin>=0.7.0; extra == 'docs'
|
|
41
44
|
Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
|
|
42
45
|
Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
|
|
46
|
+
Provides-Extra: gemini
|
|
47
|
+
Requires-Dist: google-generativeai>=0.3.0; extra == 'gemini'
|
|
48
|
+
Provides-Extra: openai
|
|
49
|
+
Requires-Dist: openai>=1.0.0; extra == 'openai'
|
|
43
50
|
Description-Content-Type: text/markdown
|
|
44
51
|
|
|
45
52
|
# mcpbr
|
|
@@ -1,40 +1,56 @@
|
|
|
1
|
-
mcpbr/__init__.py,sha256=
|
|
1
|
+
mcpbr/__init__.py,sha256=uTZA0md1SKeI-aFAFCGdnfYD8NVOI70BQhxNN_EsyxA,447
|
|
2
2
|
mcpbr/__main__.py,sha256=WmeQsAqtW_9tMTNKArH1m76DPBokZpXuy6dMZp13gXA,132
|
|
3
3
|
mcpbr/agent.py,sha256=aSFH2S3ExKZfdVfMbzk6D1nRhpKt4JmpRzmF4Vi6Gmo,5795
|
|
4
4
|
mcpbr/cache.py,sha256=YiP13omwMbXLb6NhNocJvL58enXEx9J8OrvTZnWUkw4,13254
|
|
5
5
|
mcpbr/cli.py,sha256=xvh7gpJx0LzjV3g-Te4FF7BfHubGzDxOiYQsSeQnCEc,68276
|
|
6
|
-
mcpbr/config.py,sha256=
|
|
6
|
+
mcpbr/config.py,sha256=KEuOzo-hhGwbejljlUpQn7gWC42Y5doe07hqeLwQIYA,20171
|
|
7
7
|
mcpbr/config_inheritance.py,sha256=0EV9Tv62UFNgZoc8mY7yYjHEbnMM_R5EAhSeuK7ajAA,6617
|
|
8
|
+
mcpbr/config_migration.py,sha256=vTs52uYLO0DusB07nHZT2Y27-_eZdZKhaXYWhDFcnJI,16098
|
|
8
9
|
mcpbr/config_validator.py,sha256=ZMEIeK4y6fSwyY46Xv5dK5v3jM4HDKcYkosnIcn7iyI,20488
|
|
10
|
+
mcpbr/config_wizard.py,sha256=IPk5a2xI_6CTvo9tIDe2MLGewbc_iVYg3k_NOTC6A6c,22522
|
|
9
11
|
mcpbr/custom_metrics.py,sha256=4pMO9-BPpeQ_GUTnZ18TQXINFScAMH3cIYm0HG-C51o,13213
|
|
12
|
+
mcpbr/dashboard.py,sha256=wt2A-yFgDvQc94wgPPJlz70gFAkyUi41xgfqPL9xRQY,21884
|
|
13
|
+
mcpbr/dataset_streaming.py,sha256=XwQSdvy97yurlcAC5hUwto8bLuCf2A9FSMcwjTD_Tho,16720
|
|
10
14
|
mcpbr/dataset_versioning.py,sha256=Y_ZSGhl8ihl6Kgee_p7VbkNwGhgwIdMZPlRunvk4knY,7149
|
|
11
|
-
mcpbr/
|
|
15
|
+
mcpbr/docker_cache.py,sha256=jn_9Ak2d8omNmedSCBwA7wrswtEQvB-Bu8TIP2cm-F0,18704
|
|
16
|
+
mcpbr/docker_env.py,sha256=EQmVfQNyvWmmMErRJeEpsvJp1dNiLyIG2EDAmjZVMTI,31795
|
|
17
|
+
mcpbr/docker_prewarm.py,sha256=DCB5-h-Uu7KjsfMCGsZOTpTjLhshMQ8QfpbQh-T1PxU,12639
|
|
18
|
+
mcpbr/dry_run.py,sha256=rlohv2QjtaFvLeJ7JOAPIH95GGOFoenFLcmBk-xX8x4,18219
|
|
12
19
|
mcpbr/env_expansion.py,sha256=Rkhth-tWV8CptQlSSk9exuMsUaSTTW9hj69z4snZd_U,6122
|
|
13
20
|
mcpbr/evaluation.py,sha256=EjPREWv7hBRqhBhNan0ERh2imqMBegT0Y2cgZlTxRGk,12765
|
|
14
21
|
mcpbr/failure_analysis.py,sha256=N5xp9YPe2d7P9fTa2LVSHsPgB1WOQtWMeClq3bOv4_c,19883
|
|
15
22
|
mcpbr/few_shot.py,sha256=bFDdes_kgZAFWoFZQEfZG5Z2Es9rmkB1jsxSMp4aCCM,11684
|
|
16
|
-
mcpbr/
|
|
17
|
-
mcpbr/
|
|
23
|
+
mcpbr/formatting.py,sha256=lwZcb4fD5osBzJlerICyvAVb4KHSm_nRTBg1dVfD6Lo,14193
|
|
24
|
+
mcpbr/gpu_support.py,sha256=6f_t_bGoADnu0uPsGDwSWYD7xHN_e553x1J6ceQUHi4,5071
|
|
25
|
+
mcpbr/graceful_degradation.py,sha256=QPFQ0BCLERvH4e68rQkePHUW9BDJmXttlIQaqCOP5gw,8992
|
|
26
|
+
mcpbr/harness.py,sha256=Rc6CqzZOMJyuHqfuOIDisLOoPka-cqAqYiL7zr7ALFg,53193
|
|
18
27
|
mcpbr/harnesses.py,sha256=h9iDp4qkPABNwO9OXbJ61qcD4n0oAUTU7AQksxRKLcg,47335
|
|
19
28
|
mcpbr/incremental_save.py,sha256=1dm3pGiEIhP8cVk_Y6XF_cAdo3B_vyRc6CO8Wt-MyIA,4830
|
|
20
29
|
mcpbr/junit_reporter.py,sha256=M_02zJbFbA3VoIYG5oR7VDecqWHEpIee-JOUShWNuLU,9261
|
|
30
|
+
mcpbr/languages.py,sha256=LrWjEsaH9nl37LZtRxPwC8J3zETMbbcxRI16gjMx2iw,8087
|
|
21
31
|
mcpbr/latency_metrics.py,sha256=xNMaUzGMSbOIfuoyZGyIfyMk5uAmoj6K65ZAs5D6Z8c,10476
|
|
22
32
|
mcpbr/log_formatter.py,sha256=d2jWH7z4IRSbr8-PbnEt3TmLAqk8vgdPT38uTnTCN5c,21488
|
|
23
|
-
mcpbr/
|
|
33
|
+
mcpbr/logging_config.py,sha256=0Q5qNhT_Pz1mYwj1rRENuDlIdiHBBNN7N-cSWAg5B6M,6823
|
|
34
|
+
mcpbr/models.py,sha256=epRVhs1ryGO74ehDRppArUfeOTxnHYyctEo0fD8Dwvw,5698
|
|
24
35
|
mcpbr/output_validator.py,sha256=TUoBtDjjXvR6MACbWV6uNOsxM_n4C0Jbn5in35HH4K8,1750
|
|
25
|
-
mcpbr/preflight.py,sha256=
|
|
26
|
-
mcpbr/pricing.py,sha256=
|
|
36
|
+
mcpbr/preflight.py,sha256=owWPOCEIN8vSY1Z2NB1T9ow7UzY6O6mLh5KPK1ZxEGo,8915
|
|
37
|
+
mcpbr/pricing.py,sha256=L-HE-C8LWCQ2iWEqWHkBtyLzPPBdr05bVaCmEy9EQsw,9693
|
|
27
38
|
mcpbr/profiler.py,sha256=SRXLKf2TOlpnMbQpGvjRy1Agv-XaEz6lDmBa5WGNv8c,15954
|
|
28
|
-
mcpbr/providers.py,sha256=
|
|
39
|
+
mcpbr/providers.py,sha256=p8gsPbcM9HZjSFztCbBUBvbWqc0hA_lprNh-vtDwDiE,16721
|
|
29
40
|
mcpbr/regression.py,sha256=xm_ago8ZP3RAOrDNjtINwyRUvzKWJcJDWbzf3hp6LlU,12827
|
|
30
41
|
mcpbr/reporting.py,sha256=Odzb7EgpimW-qh01VQedhb2X594ACrOcGe4jshgiwTg,56111
|
|
42
|
+
mcpbr/resource_limits.py,sha256=MnmKcJLPCD-cyydGyiZz1TOPsQSpmdsmrbAvYWlxM5s,16594
|
|
43
|
+
mcpbr/result_streaming.py,sha256=uXKCPNSXhBd8VV-Sc5aldC7zSM1MG8pAiMgsMP7ZZOA,16602
|
|
31
44
|
mcpbr/sampling.py,sha256=Hpgh2TayI3QGcno-Np9eYi8sklxKEZQXyhpaQlc9T4Q,6248
|
|
32
45
|
mcpbr/schema.py,sha256=fdjiKmp1au2oN5aXcPRoCbyvwm2XeMD5DmeWSurMk4A,6858
|
|
33
|
-
mcpbr/
|
|
46
|
+
mcpbr/sdk.py,sha256=uxX7lHzR0SkZe3lmTT7zi_C0nPPQY5wGtN1USIGf-8M,8772
|
|
47
|
+
mcpbr/smoke_test.py,sha256=SCWR26kVmHZj-smWziwMh24UIO_MtIaik-REuHZ0pbc,14926
|
|
34
48
|
mcpbr/state_tracker.py,sha256=rIP9LIHtQg6oBsLIxnwRjE865Kw6U7DMO_GzzuMRC0E,10790
|
|
35
49
|
mcpbr/statistics.py,sha256=Ny8TMdBrIpS4KfKCJcuFfTeaGuTmEkS1G_uHBlboYdA,19134
|
|
36
50
|
mcpbr/streaming.py,sha256=XPhkXO1R1EsWtkoPvCpyy4TehEom7hkuOeP-00joX3o,13853
|
|
37
51
|
mcpbr/swebench_test_specs.py,sha256=Mh_BPjcexkgDT3p4zT2p31925b8w5tgsxxRpYZQZalM,1390
|
|
52
|
+
mcpbr/task_batching.py,sha256=KFw_RxZUvVgjRr3pc0ehBX2GAoPT5yTP4VdM43e1_Dk,15275
|
|
53
|
+
mcpbr/task_scheduler.py,sha256=HBR8ebJJ-ZDzM2nnWaIHhOqdbdqRjYzbEVGYjGiVYEw,17183
|
|
38
54
|
mcpbr/templates.py,sha256=dqwboVB-yfE06w2rgDOvuWJB4Hx5duH_W-jvLBqmlKg,10683
|
|
39
55
|
mcpbr/benchmarks/__init__.py,sha256=2-7Ebg6-wHo1QGfVKWjjbREcLG_A-6Q0XfZGiyXrOeE,4489
|
|
40
56
|
mcpbr/benchmarks/adversarial.py,sha256=69VBTZv6BhR1JwjQepA_YwAu3b--vJviGd6IWs2h1QA,12357
|
|
@@ -80,15 +96,15 @@ mcpbr/infrastructure/azure_health.py,sha256=xITmIa9IfYIwxcVhY0sJ81a-6WNKiT8kSQTd
|
|
|
80
96
|
mcpbr/infrastructure/base.py,sha256=Olj6uiNBeGoUqltZI1NHZfa26kzT-6jfp8YIXSykFKM,3037
|
|
81
97
|
mcpbr/infrastructure/local.py,sha256=VK6UAg7Dzvb9v1LAJgNGA_s0blQKrHAQEXBAC75zAL8,4237
|
|
82
98
|
mcpbr/infrastructure/manager.py,sha256=j0T7U1Tbajmfve4SNfhYKikvL9kgSVT01fYKMC-sH-s,4796
|
|
83
|
-
mcpbr-0.
|
|
84
|
-
mcpbr-0.
|
|
85
|
-
mcpbr-0.
|
|
86
|
-
mcpbr-0.
|
|
87
|
-
mcpbr-0.
|
|
88
|
-
mcpbr-0.
|
|
89
|
-
mcpbr-0.
|
|
90
|
-
mcpbr-0.
|
|
91
|
-
mcpbr-0.
|
|
92
|
-
mcpbr-0.
|
|
93
|
-
mcpbr-0.
|
|
94
|
-
mcpbr-0.
|
|
99
|
+
mcpbr-0.6.0.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
|
|
100
|
+
mcpbr-0.6.0.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
|
|
101
|
+
mcpbr-0.6.0.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
|
|
102
|
+
mcpbr-0.6.0.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
|
|
103
|
+
mcpbr-0.6.0.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
|
|
104
|
+
mcpbr-0.6.0.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
|
|
105
|
+
mcpbr-0.6.0.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
|
|
106
|
+
mcpbr-0.6.0.dist-info/METADATA,sha256=sNd6FYrOQ1tgz6bum5D03w373dEJfXODVLfmLsiy5hk,55376
|
|
107
|
+
mcpbr-0.6.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
108
|
+
mcpbr-0.6.0.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
|
|
109
|
+
mcpbr-0.6.0.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
|
|
110
|
+
mcpbr-0.6.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|