mcpbr-0.5.0-py3-none-any.whl → mcpbr-0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcpbr/__init__.py +20 -1
- mcpbr/config.py +37 -1
- mcpbr/docker_env.py +2 -1
- mcpbr/docker_prewarm.py +2 -1
- mcpbr/dry_run.py +2 -1
- mcpbr/gpu_support.py +2 -1
- mcpbr/graceful_degradation.py +277 -0
- mcpbr/languages.py +228 -0
- mcpbr/logging_config.py +207 -0
- mcpbr/models.py +66 -0
- mcpbr/preflight.py +2 -1
- mcpbr/pricing.py +72 -0
- mcpbr/providers.py +316 -3
- mcpbr/sdk.py +264 -0
- mcpbr/smoke_test.py +2 -1
- {mcpbr-0.5.0.dist-info → mcpbr-0.6.0.dist-info}/METADATA +8 -1
- {mcpbr-0.5.0.dist-info → mcpbr-0.6.0.dist-info}/RECORD +27 -23
- {mcpbr-0.5.0.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.5.0.dist-info → mcpbr-0.6.0.dist-info}/WHEEL +0 -0
- {mcpbr-0.5.0.dist-info → mcpbr-0.6.0.dist-info}/entry_points.txt +0 -0
- {mcpbr-0.5.0.dist-info → mcpbr-0.6.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/providers.py
CHANGED
@@ -1,6 +1,8 @@
 """Model provider abstractions for different LLM APIs."""
 
+import json
 import os
+import uuid
 from dataclasses import dataclass, field
 from typing import Any, Protocol, runtime_checkable
 
@@ -142,8 +144,6 @@ class AnthropicProvider:
             if block.type == "text":
                 content_text = block.text
             elif block.type == "tool_use":
-                import json
-
                 tool_calls.append(
                     ToolCall(
                         id=block.id,
@@ -170,8 +170,321 @@ class AnthropicProvider:
         )
 
 
+class OpenAIProvider:
+    """Provider for OpenAI API (GPT models)."""
+
+    def __init__(
+        self,
+        model: str,
+        api_key: str | None = None,
+    ) -> None:
+        """Initialize OpenAI provider.
+
+        Args:
+            model: OpenAI model ID (e.g., 'gpt-4o', 'gpt-4-turbo').
+            api_key: API key. If None, uses OPENAI_API_KEY env var.
+        """
+        self._model = model
+        self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
+        if not self._api_key:
+            raise ValueError(
+                "OpenAI API key required. Set OPENAI_API_KEY environment variable "
+                "or pass api_key parameter."
+            )
+        import openai
+
+        self._client = openai.OpenAI(api_key=self._api_key)
+
+    @property
+    def model(self) -> str:
+        return self._model
+
+    def get_tool_format(self) -> str:
+        return "openai"
+
+    def chat(
+        self,
+        messages: list[dict[str, Any]],
+        tools: list[dict[str, Any]] | None = None,
+        max_tokens: int = 4096,
+    ) -> ChatResponse:
+        kwargs: dict[str, Any] = {
+            "model": self._model,
+            "messages": messages,
+            "max_tokens": max_tokens,
+        }
+        if tools:
+            kwargs["tools"] = tools
+
+        response = self._client.chat.completions.create(**kwargs)
+
+        if not response.choices:
+            raise RuntimeError("OpenAI API returned empty response choices")
+
+        choice = response.choices[0]
+        tool_calls = []
+        if choice.message.tool_calls:
+            for tc in choice.message.tool_calls:
+                tool_calls.append(
+                    ToolCall(
+                        id=tc.id,
+                        name=tc.function.name,
+                        arguments=tc.function.arguments,
+                    )
+                )
+
+        return ChatResponse(
+            message=ChatMessage(
+                role="assistant",
+                content=choice.message.content,
+                tool_calls=tool_calls,
+            ),
+            finish_reason=choice.finish_reason,
+            input_tokens=response.usage.prompt_tokens,
+            output_tokens=response.usage.completion_tokens,
+        )
+
+
+class GeminiProvider:
+    """Provider for Google Gemini API."""
+
+    def __init__(
+        self,
+        model: str,
+        api_key: str | None = None,
+    ) -> None:
+        """Initialize Gemini provider.
+
+        Args:
+            model: Gemini model ID (e.g., 'gemini-2.0-flash', 'gemini-1.5-pro').
+            api_key: API key. If None, uses GOOGLE_API_KEY env var.
+        """
+        self._model = model
+        self._api_key = api_key or os.environ.get("GOOGLE_API_KEY")
+        if not self._api_key:
+            raise ValueError(
+                "Google API key required. Set GOOGLE_API_KEY environment variable "
+                "or pass api_key parameter."
+            )
+        import google.generativeai as genai
+
+        genai.configure(api_key=self._api_key)
+        self._genai = genai
+        self._client = genai.GenerativeModel(model)
+
+    @property
+    def model(self) -> str:
+        return self._model
+
+    def get_tool_format(self) -> str:
+        return "openai"
+
+    def _convert_messages(
+        self, messages: list[dict[str, Any]]
+    ) -> tuple[list[dict[str, Any]], str | None]:
+        """Convert OpenAI-style messages to Gemini content format.
+
+        Extracts system messages to use as system_instruction (Gemini's native
+        system prompt support), and converts the remaining messages.
+
+        Args:
+            messages: List of OpenAI-style message dicts.
+
+        Returns:
+            Tuple of (contents, system_instruction). system_instruction is None
+            if no system message was found.
+        """
+        contents: list[dict[str, Any]] = []
+        system_instruction: str | None = None
+        for msg in messages:
+            role = msg.get("role", "user")
+            if role == "system":
+                system_instruction = msg.get("content", "")
+            elif role == "assistant":
+                contents.append({"role": "model", "parts": [msg.get("content", "")]})
+            else:
+                contents.append({"role": role, "parts": [msg.get("content", "")]})
+        return contents, system_instruction
+
+    def _convert_tools(self, tools: list[dict[str, Any]] | None) -> list[Any] | None:
+        """Convert OpenAI-style tool definitions to Gemini function declarations.
+
+        Args:
+            tools: List of OpenAI-style tool dicts.
+
+        Returns:
+            List of Gemini Tool objects, or None.
+        """
+        if not tools:
+            return None
+
+        function_declarations = []
+        for tool in tools:
+            func = tool.get("function", {})
+            function_declarations.append(
+                self._genai.protos.FunctionDeclaration(
+                    name=func.get("name", ""),
+                    description=func.get("description", ""),
+                    parameters=func.get("parameters"),
+                )
+            )
+        return [self._genai.protos.Tool(function_declarations=function_declarations)]
+
+    def chat(
+        self,
+        messages: list[dict[str, Any]],
+        tools: list[dict[str, Any]] | None = None,
+        max_tokens: int = 4096,
+    ) -> ChatResponse:
+        contents, system_instruction = self._convert_messages(messages)
+        gemini_tools = self._convert_tools(tools)
+
+        kwargs: dict[str, Any] = {
+            "contents": contents,
+            "generation_config": {"max_output_tokens": max_tokens},
+        }
+        if gemini_tools:
+            kwargs["tools"] = gemini_tools
+        if system_instruction:
+            kwargs["system_instruction"] = system_instruction
+
+        response = self._client.generate_content(**kwargs)
+
+        if not response.candidates:
+            raise RuntimeError("Gemini API returned empty candidates")
+
+        content_text = ""
+        tool_calls = []
+        candidate = response.candidates[0]
+
+        for part in candidate.content.parts:
+            if part.function_call and part.function_call.name:
+                args_dict = dict(part.function_call.args) if part.function_call.args else {}
+                tool_calls.append(
+                    ToolCall(
+                        id=f"call_{uuid.uuid4().hex[:24]}",
+                        name=part.function_call.name,
+                        arguments=json.dumps(args_dict),
+                    )
+                )
+            elif part.text:
+                content_text = part.text
+
+        finish_reason = "stop"
+        if tool_calls:
+            finish_reason = "tool_calls"
+        elif hasattr(candidate.finish_reason, "name"):
+            reason_name = candidate.finish_reason.name
+            if reason_name == "STOP":
+                finish_reason = "stop"
+            elif reason_name == "MAX_TOKENS":
+                finish_reason = "length"
+
+        return ChatResponse(
+            message=ChatMessage(
+                role="assistant",
+                content=content_text if content_text else None,
+                tool_calls=tool_calls,
+            ),
+            finish_reason=finish_reason,
+            input_tokens=getattr(response.usage_metadata, "prompt_token_count", 0)
+            if response.usage_metadata
+            else 0,
+            output_tokens=getattr(response.usage_metadata, "candidates_token_count", 0)
+            if response.usage_metadata
+            else 0,
+        )
+
+
+class QwenProvider:
+    """Provider for Alibaba Qwen API (OpenAI-compatible via DashScope).
+
+    Qwen models are accessed through the DashScope international API endpoint
+    which provides an OpenAI-compatible interface.
+    """
+
+    DASHSCOPE_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
+
+    def __init__(
+        self,
+        model: str,
+        api_key: str | None = None,
+    ) -> None:
+        """Initialize Qwen provider.
+
+        Args:
+            model: Qwen model ID (e.g., 'qwen-plus', 'qwen-turbo', 'qwen-max').
+            api_key: API key. If None, uses DASHSCOPE_API_KEY env var.
+        """
+        self._model = model
+        self._api_key = api_key or os.environ.get("DASHSCOPE_API_KEY")
+        if not self._api_key:
+            raise ValueError(
+                "DashScope API key required. Set DASHSCOPE_API_KEY environment variable "
+                "or pass api_key parameter."
+            )
+        import openai
+
+        self._client = openai.OpenAI(
+            api_key=self._api_key,
+            base_url=self.DASHSCOPE_BASE_URL,
+        )
+
+    @property
+    def model(self) -> str:
+        return self._model
+
+    def get_tool_format(self) -> str:
+        return "openai"
+
+    def chat(
+        self,
+        messages: list[dict[str, Any]],
+        tools: list[dict[str, Any]] | None = None,
+        max_tokens: int = 4096,
+    ) -> ChatResponse:
+        kwargs: dict[str, Any] = {
+            "model": self._model,
+            "messages": messages,
+            "max_tokens": max_tokens,
+        }
+        if tools:
+            kwargs["tools"] = tools
+
+        response = self._client.chat.completions.create(**kwargs)
+
+        if not response.choices:
+            raise RuntimeError("Qwen API returned empty response choices")
+
+        choice = response.choices[0]
+        tool_calls = []
+        if choice.message.tool_calls:
+            for tc in choice.message.tool_calls:
+                tool_calls.append(
+                    ToolCall(
+                        id=tc.id,
+                        name=tc.function.name,
+                        arguments=tc.function.arguments,
+                    )
+                )
+
+        return ChatResponse(
+            message=ChatMessage(
+                role="assistant",
+                content=choice.message.content,
+                tool_calls=tool_calls,
+            ),
+            finish_reason=choice.finish_reason,
+            input_tokens=response.usage.prompt_tokens,
+            output_tokens=response.usage.completion_tokens,
+        )
+
+
 PROVIDER_REGISTRY: dict[str, type] = {
     "anthropic": AnthropicProvider,
+    "openai": OpenAIProvider,
+    "gemini": GeminiProvider,
+    "qwen": QwenProvider,
 }
 
 
@@ -183,7 +496,7 @@ def create_provider(
     """Factory function to create a model provider.
 
     Args:
-        provider_name: Name of the provider (
+        provider_name: Name of the provider ('anthropic', 'openai', 'gemini', 'qwen').
        model: Model identifier for the provider.
        api_key: Optional API key.
 
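The three new providers register under `PROVIDER_REGISTRY` and are reachable through the existing `create_provider` factory. Below is a minimal usage sketch, not taken from the diff: it assumes the argument order documented in the factory docstring above (provider_name, model, optional api_key), that the matching client library is installed, and that the relevant API key environment variable is set.

```python
# Hedged sketch: exercising a new provider via create_provider.
# Assumes create_provider(provider_name, model, api_key=None) as documented above
# and that OPENAI_API_KEY is set (GeminiProvider/QwenProvider work analogously).
from mcpbr.providers import create_provider

provider = create_provider("openai", "gpt-4o")       # or "gemini" / "qwen"
print(provider.model, provider.get_tool_format())    # tool format is "openai" for all three

response = provider.chat(
    messages=[
        {"role": "system", "content": "You are a benchmark agent."},
        {"role": "user", "content": "Say hello."},
    ],
    max_tokens=256,
)

# Each provider normalizes its reply into ChatResponse / ChatMessage / ToolCall,
# so downstream code can branch on finish_reason without provider-specific logic.
print(response.finish_reason, response.input_tokens, response.output_tokens)
for call in response.message.tool_calls:
    print(call.id, call.name, call.arguments)
```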
mcpbr/sdk.py
ADDED
@@ -0,0 +1,264 @@
+"""Public Python SDK for mcpbr.
+
+Provides a programmatic interface for running MCP server benchmarks
+from Python code, without requiring the CLI.
+
+Example usage::
+
+    from mcpbr import MCPBenchmark, list_benchmarks, list_models
+
+    # List available benchmarks
+    for b in list_benchmarks():
+        print(b["name"])
+
+    # Create and run a benchmark
+    bench = MCPBenchmark({
+        "mcp_server": {
+            "command": "npx",
+            "args": ["-y", "@modelcontextprotocol/server-filesystem", "{workdir}"],
+        },
+        "benchmark": "humaneval",
+        "model": "sonnet",
+    })
+
+    is_valid, errors = bench.validate()
+    plan = bench.dry_run()
+
+    # Async execution
+    import asyncio
+    result = asyncio.run(bench.run())
+    print(result.success, result.summary)
+"""
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from . import __version__
+from .benchmarks import BENCHMARK_REGISTRY
+from .config import VALID_PROVIDERS, HarnessConfig, load_config
+from .models import SUPPORTED_MODELS, validate_model
+
+
+@dataclass
+class BenchmarkResult:
+    """Result of a benchmark run.
+
+    Attributes:
+        success: Whether the benchmark completed successfully.
+        summary: Aggregated results (e.g., pass rate, resolved count).
+        tasks: Per-task results as a list of dicts.
+        metadata: Run metadata (benchmark name, model, timestamps, etc.).
+        total_cost: Total API cost in USD.
+        total_tokens: Total tokens consumed.
+        duration_seconds: Wall-clock duration of the run.
+    """
+
+    success: bool
+    summary: dict[str, Any]
+    tasks: list[dict[str, Any]]
+    metadata: dict[str, Any]
+    total_cost: float = 0.0
+    total_tokens: int = 0
+    duration_seconds: float = 0.0
+
+
+class MCPBenchmark:
+    """High-level interface for configuring and running MCP benchmarks.
+
+    Can be initialized from a config dict, a YAML file path (str or Path),
+    or an existing HarnessConfig instance.
+
+    Args:
+        config: A dict of config values, a path to a YAML config file
+            (str or Path), or a HarnessConfig instance.
+
+    Raises:
+        FileNotFoundError: If a file path is given and the file does not exist.
+        ValueError: If the config dict is invalid.
+    """
+
+    def __init__(self, config: dict[str, Any] | str | Path | HarnessConfig) -> None:
+        if isinstance(config, HarnessConfig):
+            self.config: HarnessConfig = config
+        elif isinstance(config, (str, Path)):
+            path = Path(config)
+            if not path.exists():
+                raise FileNotFoundError(f"Config file not found: {path}")
+            self.config = load_config(path, warn_security=False)
+        elif isinstance(config, dict):
+            self.config = HarnessConfig(**config)
+        else:
+            raise TypeError(
+                f"config must be a dict, str, Path, or HarnessConfig, got {type(config).__name__}"
+            )
+
+    def validate(self) -> tuple[bool, list[str]]:
+        """Validate the current configuration.
+
+        Checks that the configuration is internally consistent, the model
+        is recognized, and required fields are present.
+
+        Returns:
+            A tuple of (is_valid, list_of_warnings_or_errors).
+        """
+        errors: list[str] = []
+
+        # Validate model is in the supported registry
+        model_valid, model_error = validate_model(self.config.model)
+        if not model_valid:
+            errors.append(f"Model warning: {model_error}")
+
+        # Validate benchmark is in the registry
+        if self.config.benchmark not in BENCHMARK_REGISTRY:
+            errors.append(
+                f"Unknown benchmark: {self.config.benchmark}. "
+                f"Available: {', '.join(BENCHMARK_REGISTRY.keys())}"
+            )
+
+        # Validate provider
+        if self.config.provider not in VALID_PROVIDERS:
+            errors.append(
+                f"Unknown provider: {self.config.provider}. Available: {', '.join(VALID_PROVIDERS)}"
+            )
+
+        is_valid = len(errors) == 0
+        return is_valid, errors
+
+    def dry_run(self) -> dict[str, Any]:
+        """Generate an execution plan without running anything.
+
+        Returns:
+            A dict describing what would be executed, including benchmark,
+            model, provider, MCP server config, and runtime settings.
+        """
+        plan: dict[str, Any] = {
+            "benchmark": self.config.benchmark,
+            "model": self.config.model,
+            "provider": self.config.provider,
+            "agent_harness": self.config.agent_harness,
+            "timeout_seconds": self.config.timeout_seconds,
+            "max_concurrent": self.config.max_concurrent,
+            "max_iterations": self.config.max_iterations,
+            "sample_size": self.config.sample_size,
+        }
+
+        # Include MCP server info
+        if self.config.mcp_server:
+            plan["mcp_server"] = {
+                "command": self.config.mcp_server.command,
+                "args": self.config.mcp_server.args,
+                "name": self.config.mcp_server.name,
+            }
+
+        # Include comparison mode info if applicable
+        if self.config.comparison_mode:
+            plan["comparison_mode"] = True
+            if self.config.mcp_server_a:
+                plan["mcp_server_a"] = {
+                    "command": self.config.mcp_server_a.command,
+                    "args": self.config.mcp_server_a.args,
+                    "name": self.config.mcp_server_a.name,
+                }
+            if self.config.mcp_server_b:
+                plan["mcp_server_b"] = {
+                    "command": self.config.mcp_server_b.command,
+                    "args": self.config.mcp_server_b.args,
+                    "name": self.config.mcp_server_b.name,
+                }
+
+        # Optional settings
+        if self.config.budget is not None:
+            plan["budget"] = self.config.budget
+        if self.config.thinking_budget is not None:
+            plan["thinking_budget"] = self.config.thinking_budget
+        if self.config.agent_prompt is not None:
+            plan["agent_prompt"] = self.config.agent_prompt
+
+        return plan
+
+    async def run(self, **kwargs: Any) -> BenchmarkResult:
+        """Execute the benchmark.
+
+        This is the main entry point for running a benchmark programmatically.
+        It delegates to the internal _execute method, which can be overridden
+        or mocked for testing.
+
+        Args:
+            **kwargs: Additional keyword arguments passed to the executor.
+
+        Returns:
+            BenchmarkResult with the evaluation results.
+        """
+        return await self._execute(**kwargs)
+
+    async def _execute(self, **kwargs: Any) -> BenchmarkResult:
+        """Internal execution method.
+
+        Override or mock this method for testing. In production, this
+        would orchestrate the full benchmark pipeline (task loading,
+        environment creation, agent execution, evaluation).
+
+        Args:
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            BenchmarkResult with the evaluation results.
+
+        Raises:
+            NotImplementedError: Full execution pipeline is not yet
+                wired into the SDK. Use the CLI for actual runs.
+        """
+        raise NotImplementedError(
+            "Full benchmark execution via the SDK is not yet implemented. "
+            "Use the `mcpbr` CLI for actual benchmark runs, or mock "
+            "MCPBenchmark._execute for testing."
+        )
+
+
+def list_benchmarks() -> list[dict[str, str]]:
+    """List all available benchmarks.
+
+    Returns:
+        A list of dicts, each containing 'name' (the benchmark identifier)
+        and 'class' (the benchmark class name).
+    """
+    return [{"name": name, "class": cls.__name__} for name, cls in BENCHMARK_REGISTRY.items()]
+
+
+def list_providers() -> list[str]:
+    """List all supported model providers.
+
+    Returns:
+        A list of provider name strings.
+    """
+    return list(VALID_PROVIDERS)
+
+
+def list_models() -> list[dict[str, str]]:
+    """List all supported models with their metadata.
+
+    Returns:
+        A list of dicts, each containing 'id', 'provider',
+        'display_name', 'context_window', 'supports_tools', and 'notes'.
+    """
+    return [
+        {
+            "id": info.id,
+            "provider": info.provider,
+            "display_name": info.display_name,
+            "context_window": info.context_window,
+            "supports_tools": info.supports_tools,
+            "notes": info.notes,
+        }
+        for info in SUPPORTED_MODELS.values()
+    ]
+
+
+def get_version() -> str:
+    """Get the current mcpbr version.
+
+    Returns:
+        The version string (e.g., '0.6.0').
+    """
+    return __version__
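Because `_execute` deliberately raises `NotImplementedError` until the pipeline is wired in, programmatic `run()` calls are currently only useful with a patched executor, as the docstring suggests. A test-style sketch, not from the diff, assuming standard-library `unittest.mock` and the same config dict used in the module docstring:

```python
# Hedged test sketch: patch MCPBenchmark._execute so run() returns a canned result.
import asyncio
from unittest.mock import AsyncMock, patch

from mcpbr.sdk import BenchmarkResult, MCPBenchmark


def test_run_returns_mocked_result():
    # Same config shape as the SDK module docstring example.
    bench = MCPBenchmark({
        "mcp_server": {
            "command": "npx",
            "args": ["-y", "@modelcontextprotocol/server-filesystem", "{workdir}"],
        },
        "benchmark": "humaneval",
        "model": "sonnet",
    })

    fake = BenchmarkResult(
        success=True,
        summary={"pass_rate": 1.0},
        tasks=[],
        metadata={"benchmark": "humaneval", "model": "sonnet"},
    )

    # run() awaits self._execute(**kwargs), so an AsyncMock stands in cleanly.
    with patch.object(MCPBenchmark, "_execute", new=AsyncMock(return_value=fake)):
        result = asyncio.run(bench.run())

    assert result.success
    assert result.summary["pass_rate"] == 1.0
```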
mcpbr/smoke_test.py
CHANGED
@@ -7,12 +7,13 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 
-import docker
 from anthropic import Anthropic
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
 
+import docker
+
 from .config import load_config
 from .config_validator import validate_config
 
{mcpbr-0.5.0.dist-info → mcpbr-0.6.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mcpbr
-Version: 0.5.0
+Version: 0.6.0
 Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
 Project-URL: Homepage, https://github.com/greynewell/mcpbr
 Project-URL: Repository, https://github.com/greynewell/mcpbr
@@ -30,6 +30,9 @@ Requires-Dist: pydantic>=2.0.0
 Requires-Dist: pyyaml>=6.0.0
 Requires-Dist: requests>=2.31.0
 Requires-Dist: rich>=13.0.0
+Provides-Extra: all-providers
+Requires-Dist: google-generativeai>=0.3.0; extra == 'all-providers'
+Requires-Dist: openai>=1.0.0; extra == 'all-providers'
 Provides-Extra: dev
 Requires-Dist: pre-commit>=3.0.0; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
@@ -40,6 +43,10 @@ Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
 Requires-Dist: mkdocs-minify-plugin>=0.7.0; extra == 'docs'
 Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
 Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
+Provides-Extra: gemini
+Requires-Dist: google-generativeai>=0.3.0; extra == 'gemini'
+Provides-Extra: openai
+Requires-Dist: openai>=1.0.0; extra == 'openai'
 Description-Content-Type: text/markdown
 
 # mcpbr