mcpbr 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcpbr/providers.py CHANGED
@@ -1,6 +1,8 @@
1
1
  """Model provider abstractions for different LLM APIs."""
2
2
 
3
+ import json
3
4
  import os
5
+ import uuid
4
6
  from dataclasses import dataclass, field
5
7
  from typing import Any, Protocol, runtime_checkable
6
8
 
@@ -142,8 +144,6 @@ class AnthropicProvider:
142
144
  if block.type == "text":
143
145
  content_text = block.text
144
146
  elif block.type == "tool_use":
145
- import json
146
-
147
147
  tool_calls.append(
148
148
  ToolCall(
149
149
  id=block.id,
@@ -170,8 +170,321 @@ class AnthropicProvider:
170
170
  )
171
171
 
172
172
 
173
class OpenAIProvider:
    """Provider for OpenAI API (GPT models).

    Sends requests through the ``openai`` SDK's Chat Completions endpoint and
    repackages the first returned choice as a ``ChatResponse``.
    """

    def __init__(
        self,
        model: str,
        api_key: str | None = None,
    ) -> None:
        """Initialize OpenAI provider.

        Args:
            model: OpenAI model ID (e.g., 'gpt-4o', 'gpt-4-turbo').
            api_key: API key. If None, uses OPENAI_API_KEY env var.

        Raises:
            ValueError: If no API key is passed and OPENAI_API_KEY is unset
                or empty.
        """
        self._model = model
        self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
        if not self._api_key:
            raise ValueError(
                "OpenAI API key required. Set OPENAI_API_KEY environment variable "
                "or pass api_key parameter."
            )
        # Imported lazily so the module can be loaded without the optional
        # ``openai`` package installed.
        import openai

        self._client = openai.OpenAI(api_key=self._api_key)

    @property
    def model(self) -> str:
        """Model ID this provider was created with."""
        return self._model

    def get_tool_format(self) -> str:
        """Return the tool-definition format this provider expects."""
        return "openai"

    def chat(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None = None,
        max_tokens: int = 4096,
    ) -> ChatResponse:
        """Send a chat completion request and return the first choice.

        Args:
            messages: OpenAI-style message dicts, forwarded unchanged.
            tools: Optional OpenAI-style tool definitions. Omitted from the
                request when None or empty.
            max_tokens: Upper bound on generated tokens.

        Returns:
            ChatResponse built from the first choice. Token counts are 0 when
            the API response carries no usage information.

        Raises:
            RuntimeError: If the API returns no choices.
        """
        kwargs: dict[str, Any] = {
            "model": self._model,
            "messages": messages,
            "max_tokens": max_tokens,
        }
        if tools:
            kwargs["tools"] = tools

        response = self._client.chat.completions.create(**kwargs)

        if not response.choices:
            raise RuntimeError("OpenAI API returned empty response choices")

        choice = response.choices[0]
        tool_calls = [
            ToolCall(
                id=tc.id,
                name=tc.function.name,
                arguments=tc.function.arguments,
            )
            for tc in choice.message.tool_calls or []
        ]

        # ``usage`` is optional on the SDK's response type; do not let missing
        # accounting data turn a successful completion into an AttributeError.
        usage = response.usage

        return ChatResponse(
            message=ChatMessage(
                role="assistant",
                content=choice.message.content,
                tool_calls=tool_calls,
            ),
            finish_reason=choice.finish_reason,
            input_tokens=usage.prompt_tokens if usage else 0,
            output_tokens=usage.completion_tokens if usage else 0,
        )
246
+
247
+
248
class GeminiProvider:
    """Provider for Google Gemini API.

    Accepts OpenAI-style messages and tool definitions, converts them to the
    structures used by the ``google.generativeai`` SDK, and repackages the
    first returned candidate as a ``ChatResponse``.
    """

    def __init__(
        self,
        model: str,
        api_key: str | None = None,
    ) -> None:
        """Initialize Gemini provider.

        Args:
            model: Gemini model ID (e.g., 'gemini-2.0-flash', 'gemini-1.5-pro').
            api_key: API key. If None, uses GOOGLE_API_KEY env var.

        Raises:
            ValueError: If no API key is passed and GOOGLE_API_KEY is unset
                or empty.
        """
        self._model = model
        self._api_key = api_key or os.environ.get("GOOGLE_API_KEY")
        if not self._api_key:
            raise ValueError(
                "Google API key required. Set GOOGLE_API_KEY environment variable "
                "or pass api_key parameter."
            )
        # Imported lazily so the module can be loaded without the optional
        # ``google-generativeai`` package installed.
        import google.generativeai as genai

        genai.configure(api_key=self._api_key)
        self._genai = genai
        self._client = genai.GenerativeModel(model)

    @property
    def model(self) -> str:
        """Model ID this provider was created with."""
        return self._model

    def get_tool_format(self) -> str:
        """Return the tool-definition format callers should supply.

        Tools are accepted in OpenAI format and translated by
        ``_convert_tools``.
        """
        return "openai"

    @staticmethod
    def _to_plain(value: Any) -> Any:
        """Recursively convert mapping/sequence containers to dict/list.

        Function-call arguments come back as protobuf container objects;
        ``dict()`` only converts the outermost level, so nested objects or
        arrays would not be JSON-serializable without this.
        """
        from collections.abc import Mapping, Sequence

        if isinstance(value, Mapping):
            return {key: GeminiProvider._to_plain(item) for key, item in value.items()}
        if isinstance(value, Sequence) and not isinstance(value, (str, bytes)):
            return [GeminiProvider._to_plain(item) for item in value]
        return value

    def _convert_messages(
        self, messages: list[dict[str, Any]]
    ) -> tuple[list[dict[str, Any]], str | None]:
        """Convert OpenAI-style messages to Gemini content format.

        Extracts system messages to use as system_instruction (Gemini's native
        system prompt support), and converts the remaining messages. When
        several system messages are present, the last one wins.

        Args:
            messages: List of OpenAI-style message dicts.

        Returns:
            Tuple of (contents, system_instruction). system_instruction is None
            if no system message was found.
        """
        contents: list[dict[str, Any]] = []
        system_instruction: str | None = None
        for msg in messages:
            role = msg.get("role", "user")
            if role == "system":
                system_instruction = msg.get("content", "")
                continue
            # An explicit ``"content": None`` (as on assistant turns that only
            # carry tool calls) must not end up as a None part.
            content = msg.get("content") or ""
            # NOTE(review): roles other than "assistant" are passed through
            # unchanged, including "tool"; confirm the Gemini API accepts the
            # roles the caller produces.
            gemini_role = "model" if role == "assistant" else role
            contents.append({"role": gemini_role, "parts": [content]})
        return contents, system_instruction

    def _convert_tools(self, tools: list[dict[str, Any]] | None) -> list[Any] | None:
        """Convert OpenAI-style tool definitions to Gemini function declarations.

        Args:
            tools: List of OpenAI-style tool dicts.

        Returns:
            A single-element list holding one Gemini Tool that bundles all
            function declarations, or None when no tools were given.
        """
        if not tools:
            return None

        function_declarations = []
        for tool in tools:
            func = tool.get("function", {})
            function_declarations.append(
                self._genai.protos.FunctionDeclaration(
                    name=func.get("name", ""),
                    description=func.get("description", ""),
                    parameters=func.get("parameters"),
                )
            )
        return [self._genai.protos.Tool(function_declarations=function_declarations)]

    def chat(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None = None,
        max_tokens: int = 4096,
    ) -> ChatResponse:
        """Send a generation request and return the first candidate.

        Args:
            messages: OpenAI-style message dicts.
            tools: Optional OpenAI-style tool definitions.
            max_tokens: Upper bound on generated tokens.

        Returns:
            ChatResponse built from the first candidate. ``finish_reason`` is
            "tool_calls" when the model requested a function call, "length"
            when generation hit the token limit, and "stop" otherwise.

        Raises:
            RuntimeError: If the API returns no candidates.
        """
        contents, system_instruction = self._convert_messages(messages)
        gemini_tools = self._convert_tools(tools)

        kwargs: dict[str, Any] = {
            "contents": contents,
            "generation_config": {"max_output_tokens": max_tokens},
        }
        if gemini_tools:
            kwargs["tools"] = gemini_tools

        client = self._client
        if system_instruction:
            # generate_content() has no system_instruction parameter; the SDK
            # takes it on the GenerativeModel constructor, so build a model
            # bound to this conversation's system prompt.
            client = self._genai.GenerativeModel(
                self._model, system_instruction=system_instruction
            )

        response = client.generate_content(**kwargs)

        if not response.candidates:
            raise RuntimeError("Gemini API returned empty candidates")

        text_chunks: list[str] = []
        tool_calls = []
        candidate = response.candidates[0]

        for part in candidate.content.parts:
            if part.function_call and part.function_call.name:
                raw_args = part.function_call.args
                args_dict = self._to_plain(raw_args) if raw_args else {}
                tool_calls.append(
                    ToolCall(
                        # A call ID is generated locally for each function call.
                        id=f"call_{uuid.uuid4().hex[:24]}",
                        name=part.function_call.name,
                        arguments=json.dumps(args_dict),
                    )
                )
            elif part.text:
                # Keep every text part instead of only the last one.
                text_chunks.append(part.text)

        finish_reason = "stop"
        if tool_calls:
            finish_reason = "tool_calls"
        elif hasattr(candidate.finish_reason, "name"):
            reason_name = candidate.finish_reason.name
            if reason_name == "STOP":
                finish_reason = "stop"
            elif reason_name == "MAX_TOKENS":
                finish_reason = "length"

        usage = response.usage_metadata

        return ChatResponse(
            message=ChatMessage(
                role="assistant",
                content="".join(text_chunks) or None,
                tool_calls=tool_calls,
            ),
            finish_reason=finish_reason,
            input_tokens=getattr(usage, "prompt_token_count", 0) if usage else 0,
            output_tokens=getattr(usage, "candidates_token_count", 0) if usage else 0,
        )
397
+
398
+
399
class QwenProvider:
    """Provider for Alibaba Qwen API (OpenAI-compatible via DashScope).

    Qwen models are accessed through the DashScope international API endpoint
    which provides an OpenAI-compatible interface.
    """

    DASHSCOPE_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"

    def __init__(
        self,
        model: str,
        api_key: str | None = None,
    ) -> None:
        """Initialize Qwen provider.

        Args:
            model: Qwen model ID (e.g., 'qwen-plus', 'qwen-turbo', 'qwen-max').
            api_key: API key. If None, uses DASHSCOPE_API_KEY env var.

        Raises:
            ValueError: If no API key is passed and DASHSCOPE_API_KEY is unset
                or empty.
        """
        self._model = model
        self._api_key = api_key or os.environ.get("DASHSCOPE_API_KEY")
        if not self._api_key:
            raise ValueError(
                "DashScope API key required. Set DASHSCOPE_API_KEY environment variable "
                "or pass api_key parameter."
            )
        # Imported lazily so the module can be loaded without the optional
        # ``openai`` package installed.
        import openai

        # The OpenAI client is pointed at DashScope's compatible endpoint.
        self._client = openai.OpenAI(
            api_key=self._api_key,
            base_url=self.DASHSCOPE_BASE_URL,
        )

    @property
    def model(self) -> str:
        """Model ID this provider was created with."""
        return self._model

    def get_tool_format(self) -> str:
        """Return the tool-definition format this provider expects."""
        return "openai"

    def chat(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None = None,
        max_tokens: int = 4096,
    ) -> ChatResponse:
        """Send a chat completion request and return the first choice.

        Args:
            messages: OpenAI-style message dicts, forwarded unchanged.
            tools: Optional OpenAI-style tool definitions. Omitted from the
                request when None or empty.
            max_tokens: Upper bound on generated tokens.

        Returns:
            ChatResponse built from the first choice. Token counts are 0 when
            the endpoint returns no usage information.

        Raises:
            RuntimeError: If the API returns no choices.
        """
        kwargs: dict[str, Any] = {
            "model": self._model,
            "messages": messages,
            "max_tokens": max_tokens,
        }
        if tools:
            kwargs["tools"] = tools

        response = self._client.chat.completions.create(**kwargs)

        if not response.choices:
            raise RuntimeError("Qwen API returned empty response choices")

        choice = response.choices[0]
        tool_calls = [
            ToolCall(
                id=tc.id,
                name=tc.function.name,
                arguments=tc.function.arguments,
            )
            for tc in choice.message.tool_calls or []
        ]

        # ``usage`` is optional on the SDK's response type; do not let missing
        # accounting data turn a successful completion into an AttributeError.
        usage = response.usage

        return ChatResponse(
            message=ChatMessage(
                role="assistant",
                content=choice.message.content,
                tool_calls=tool_calls,
            ),
            finish_reason=choice.finish_reason,
            input_tokens=usage.prompt_tokens if usage else 0,
            output_tokens=usage.completion_tokens if usage else 0,
        )
481
+
482
+
173
483
# Maps each provider name to the provider class that implements it.
PROVIDER_REGISTRY: dict[str, type] = {
    "anthropic": AnthropicProvider,
    "openai": OpenAIProvider,
    "gemini": GeminiProvider,
    "qwen": QwenProvider,
}
176
489
 
177
490
 
@@ -183,7 +496,7 @@ def create_provider(
183
496
  """Factory function to create a model provider.
184
497
 
185
498
  Args:
186
- provider_name: Name of the provider (currently only 'anthropic').
499
+ provider_name: Name of the provider ('anthropic', 'openai', 'gemini', 'qwen').
187
500
  model: Model identifier for the provider.
188
501
  api_key: Optional API key.
189
502
 
mcpbr/sdk.py ADDED
@@ -0,0 +1,264 @@
1
+ """Public Python SDK for mcpbr.
2
+
3
+ Provides a programmatic interface for running MCP server benchmarks
4
+ from Python code, without requiring the CLI.
5
+
6
+ Example usage::
7
+
8
+ from mcpbr import MCPBenchmark, list_benchmarks, list_models
9
+
10
+ # List available benchmarks
11
+ for b in list_benchmarks():
12
+ print(b["name"])
13
+
14
+ # Create and run a benchmark
15
+ bench = MCPBenchmark({
16
+ "mcp_server": {
17
+ "command": "npx",
18
+ "args": ["-y", "@modelcontextprotocol/server-filesystem", "{workdir}"],
19
+ },
20
+ "benchmark": "humaneval",
21
+ "model": "sonnet",
22
+ })
23
+
24
+ is_valid, errors = bench.validate()
25
+ plan = bench.dry_run()
26
+
27
+ # Async execution
28
+ import asyncio
29
+ result = asyncio.run(bench.run())
30
+ print(result.success, result.summary)
31
+ """
32
+
33
+ from dataclasses import dataclass
34
+ from pathlib import Path
35
+ from typing import Any
36
+
37
+ from . import __version__
38
+ from .benchmarks import BENCHMARK_REGISTRY
39
+ from .config import VALID_PROVIDERS, HarnessConfig, load_config
40
+ from .models import SUPPORTED_MODELS, validate_model
41
+
42
+
43
@dataclass
class BenchmarkResult:
    """Result of a benchmark run.

    The first four fields are required; the cost, token, and duration fields
    default to zero when not supplied.

    Attributes:
        success: Whether the benchmark completed successfully.
        summary: Aggregated results (e.g., pass rate, resolved count).
        tasks: Per-task results as a list of dicts.
        metadata: Run metadata (benchmark name, model, timestamps, etc.).
        total_cost: Total API cost in USD.
        total_tokens: Total tokens consumed.
        duration_seconds: Wall-clock duration of the run.
    """

    # Required fields.
    success: bool
    summary: dict[str, Any]
    tasks: list[dict[str, Any]]
    metadata: dict[str, Any]
    # Optional accounting fields.
    total_cost: float = 0.0
    total_tokens: int = 0
    duration_seconds: float = 0.0
64
+
65
+
66
class MCPBenchmark:
    """Programmatic front end for configuring and running MCP benchmarks.

    The constructor accepts a config dict, a YAML file path (str or Path),
    or an already-built HarnessConfig.

    Args:
        config: Config values as a dict, the path of a YAML config file
            (str or Path), or a HarnessConfig instance.

    Raises:
        FileNotFoundError: If a file path is given and the file does not exist.
        ValueError: If the config dict is invalid.
        TypeError: If ``config`` is none of the accepted types.
    """

    def __init__(self, config: dict[str, Any] | str | Path | HarnessConfig) -> None:
        if isinstance(config, HarnessConfig):
            self.config: HarnessConfig = config
            return

        if isinstance(config, (str, Path)):
            config_path = Path(config)
            if not config_path.exists():
                raise FileNotFoundError(f"Config file not found: {config_path}")
            self.config = load_config(config_path, warn_security=False)
            return

        if isinstance(config, dict):
            self.config = HarnessConfig(**config)
            return

        raise TypeError(
            f"config must be a dict, str, Path, or HarnessConfig, got {type(config).__name__}"
        )

    def validate(self) -> tuple[bool, list[str]]:
        """Check the configuration's model, benchmark, and provider.

        Returns:
            ``(is_valid, messages)`` where ``messages`` holds one entry per
            failed check and ``is_valid`` is True only when it is empty.
        """
        problems: list[str] = []
        cfg = self.config

        # Model must be accepted by validate_model.
        model_ok, model_message = validate_model(cfg.model)
        if not model_ok:
            problems.append(f"Model warning: {model_message}")

        # Benchmark must be a registered name.
        if cfg.benchmark not in BENCHMARK_REGISTRY:
            known_benchmarks = ", ".join(BENCHMARK_REGISTRY.keys())
            problems.append(
                f"Unknown benchmark: {cfg.benchmark}. Available: {known_benchmarks}"
            )

        # Provider must be one of the valid providers.
        if cfg.provider not in VALID_PROVIDERS:
            known_providers = ", ".join(VALID_PROVIDERS)
            problems.append(
                f"Unknown provider: {cfg.provider}. Available: {known_providers}"
            )

        return not problems, problems

    def dry_run(self) -> dict[str, Any]:
        """Describe what a run would execute, without executing it.

        Returns:
            A dict with the benchmark, model, provider and runtime settings,
            plus MCP server details, comparison-mode servers, and optional
            settings when they are configured.
        """
        cfg = self.config

        def describe_server(server: Any) -> dict[str, Any]:
            # Only these three server fields are reported in the plan.
            return {
                "command": server.command,
                "args": server.args,
                "name": server.name,
            }

        core_fields = (
            "benchmark",
            "model",
            "provider",
            "agent_harness",
            "timeout_seconds",
            "max_concurrent",
            "max_iterations",
            "sample_size",
        )
        plan: dict[str, Any] = {name: getattr(cfg, name) for name in core_fields}

        if cfg.mcp_server:
            plan["mcp_server"] = describe_server(cfg.mcp_server)

        # The A/B servers are only reported in comparison mode.
        if cfg.comparison_mode:
            plan["comparison_mode"] = True
            for name in ("mcp_server_a", "mcp_server_b"):
                server = getattr(cfg, name)
                if server:
                    plan[name] = describe_server(server)

        # Optional settings appear only when explicitly set.
        for name in ("budget", "thinking_budget", "agent_prompt"):
            value = getattr(cfg, name)
            if value is not None:
                plan[name] = value

        return plan

    async def run(self, **kwargs: Any) -> BenchmarkResult:
        """Run the benchmark.

        Public entry point; all work is delegated to ``_execute`` so that
        method can be overridden or mocked in tests.

        Args:
            **kwargs: Extra keyword arguments forwarded to ``_execute``.

        Returns:
            BenchmarkResult with the evaluation results.
        """
        result = await self._execute(**kwargs)
        return result

    async def _execute(self, **kwargs: Any) -> BenchmarkResult:
        """Execution hook behind ``run``.

        Currently a placeholder: it always raises. Override or mock it for
        testing.

        Args:
            **kwargs: Additional keyword arguments (unused).

        Returns:
            BenchmarkResult with the evaluation results (once implemented).

        Raises:
            NotImplementedError: Always; the execution pipeline is not yet
                wired into the SDK. Use the CLI for actual runs.
        """
        message = (
            "Full benchmark execution via the SDK is not yet implemented. "
            "Use the `mcpbr` CLI for actual benchmark runs, or mock "
            "MCPBenchmark._execute for testing."
        )
        raise NotImplementedError(message)
217
+
218
+
219
def list_benchmarks() -> list[dict[str, str]]:
    """Enumerate the registered benchmarks.

    Returns:
        One dict per registry entry, with 'name' (the benchmark identifier)
        and 'class' (the benchmark class name).
    """
    catalog: list[dict[str, str]] = []
    for benchmark_name, benchmark_cls in BENCHMARK_REGISTRY.items():
        catalog.append({"name": benchmark_name, "class": benchmark_cls.__name__})
    return catalog
227
+
228
+
229
def list_providers() -> list[str]:
    """Return the supported model provider names.

    Returns:
        A new list containing every entry of VALID_PROVIDERS.
    """
    return [*VALID_PROVIDERS]
236
+
237
+
238
def list_models() -> list[dict[str, Any]]:
    """List all supported models with their metadata.

    Returns:
        A list of dicts, each containing 'id', 'provider',
        'display_name', 'context_window', 'supports_tools', and 'notes'.
        Values are copied as-is from the model registry entries, so they are
        not necessarily strings.
    """
    # NOTE(review): 'context_window' and 'supports_tools' look like an int and
    # a bool respectively, hence dict[str, Any]; confirm against the model
    # info type's field definitions.
    return [
        {
            "id": info.id,
            "provider": info.provider,
            "display_name": info.display_name,
            "context_window": info.context_window,
            "supports_tools": info.supports_tools,
            "notes": info.notes,
        }
        for info in SUPPORTED_MODELS.values()
    ]
256
+
257
+
258
def get_version() -> str:
    """Report the installed mcpbr version.

    Returns:
        The package's ``__version__`` string (e.g., '0.6.0').
    """
    version: str = __version__
    return version
mcpbr/smoke_test.py CHANGED
@@ -7,12 +7,13 @@ from dataclasses import dataclass
7
7
  from pathlib import Path
8
8
  from typing import Any
9
9
 
10
- import docker
11
10
  from anthropic import Anthropic
12
11
  from rich.console import Console
13
12
  from rich.panel import Panel
14
13
  from rich.table import Table
15
14
 
15
+ import docker
16
+
16
17
  from .config import load_config
17
18
  from .config_validator import validate_config
18
19
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mcpbr
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
5
5
  Project-URL: Homepage, https://github.com/greynewell/mcpbr
6
6
  Project-URL: Repository, https://github.com/greynewell/mcpbr
@@ -30,6 +30,9 @@ Requires-Dist: pydantic>=2.0.0
30
30
  Requires-Dist: pyyaml>=6.0.0
31
31
  Requires-Dist: requests>=2.31.0
32
32
  Requires-Dist: rich>=13.0.0
33
+ Provides-Extra: all-providers
34
+ Requires-Dist: google-generativeai>=0.3.0; extra == 'all-providers'
35
+ Requires-Dist: openai>=1.0.0; extra == 'all-providers'
33
36
  Provides-Extra: dev
34
37
  Requires-Dist: pre-commit>=3.0.0; extra == 'dev'
35
38
  Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
@@ -40,6 +43,10 @@ Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
40
43
  Requires-Dist: mkdocs-minify-plugin>=0.7.0; extra == 'docs'
41
44
  Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
42
45
  Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
46
+ Provides-Extra: gemini
47
+ Requires-Dist: google-generativeai>=0.3.0; extra == 'gemini'
48
+ Provides-Extra: openai
49
+ Requires-Dist: openai>=1.0.0; extra == 'openai'
43
50
  Description-Content-Type: text/markdown
44
51
 
45
52
  # mcpbr