mcpbr 0.4.12__tar.gz → 0.4.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.claude-plugin/marketplace.json +2 -2
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.claude-plugin/package.json +1 -1
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.claude-plugin/plugin.json +1 -1
- {mcpbr-0.4.12 → mcpbr-0.4.13}/PKG-INFO +1 -1
- {mcpbr-0.4.12 → mcpbr-0.4.13}/package.json +1 -1
- {mcpbr-0.4.12 → mcpbr-0.4.13}/pyproject.toml +1 -1
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/harnesses.py +61 -98
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.claude/settings.json +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.claude-plugin/README.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.claude-plugin/skills/README.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.claude-plugin/skills/benchmark-swe-lite/SKILL.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.claude-plugin/skills/mcpbr-config/SKILL.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.claude-plugin/skills/mcpbr-eval/SKILL.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.github/dependabot.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.github/release-drafter.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.github/workflows/ci.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.github/workflows/post-release-bump.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.github/workflows/publish-npm.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.github/workflows/publish.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.github/workflows/release-drafter.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.gitignore +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/.pre-commit-config.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/AGENTS.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/CHANGELOG.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/CLAUDE.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/CODE_OF_CONDUCT.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/CONTRIBUTING.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/Dockerfile +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/HUMANEVAL_FIX_SUMMARY.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/LICENSE +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/Makefile +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/PR_SUMMARY.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/README.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/SECURITY.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/assets/mcpbr-demo.gif +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/assets/mcpbr-eval-results.png +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/assets/mcpbr-logo.jpg +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/bin/mcpbr.js +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/config/example.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/config/humaneval.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/config/supermodel.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/examples/azure-config-example.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/examples/env-vars-example.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/examples/inheritance/README.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/examples/inheritance/base-config.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/examples/inheritance/dev-config.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/examples/inheritance/multi-extend-config.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/examples/inheritance/production-config.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/examples/inheritance/shared-mcp-settings.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/examples/local-config-example.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/examples/quick-start/gsm8k-math-reasoning.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/examples/quick-start/test-your-mcp-server.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/install.sh +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/requirements.txt +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/scripts/sync_version.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/scripts/validate_plugin_manifests.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/__init__.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/__main__.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/agent.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/__init__.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/agentbench.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/aider_polyglot.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/apps.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/arc.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/base.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/bigbench_hard.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/bigcodebench.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/codecontests.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/codereval.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/cybergym.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/gaia.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/gsm8k.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/hellaswag.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/humaneval.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/intercode.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/leetcode.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/math_benchmark.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/mbpp.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/mcptoolbench.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/mlagentbench.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/repoqa.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/swebench.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/terminalbench.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/toolbench.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/truthfulqa.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/benchmarks/webarena.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/cache.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/cli.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/config.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/config_inheritance.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/config_validator.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/docker_env.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/env_expansion.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/evaluation.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/harness.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/incremental_save.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/infrastructure/__init__.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/infrastructure/azure.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/infrastructure/azure_health.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/infrastructure/base.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/infrastructure/local.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/infrastructure/manager.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/junit_reporter.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/log_formatter.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/models.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/output_validator.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/preflight.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/pricing.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/profiler.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/providers.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/regression.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/reporting.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/schema.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/smoke_test.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/state_tracker.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/statistics.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/streaming.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/src/mcpbr/templates.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/__init__.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/infrastructure/__init__.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/infrastructure/test_azure.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/infrastructure/test_azure_health.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/infrastructure/test_base.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/infrastructure/test_cli_infrastructure.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/infrastructure/test_config.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/infrastructure/test_local.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/infrastructure/test_manager.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_agent.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_benchmark_filtering.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_benchmark_integration.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_benchmarks.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_cache.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_claude_plugin.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_cli_templates.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_comparison_aggregation.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_comparison_config.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_comparison_integration.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_comparison_reporting.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_config.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_config_env_vars.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_config_inheritance.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_config_validator.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_config_validator_inheritance.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_cost_calculation.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_default_logging.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_django_runner.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_docker_cleanup.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_docker_label_fix.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_docker_retry.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_env_expansion.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_error_messages.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_evaluation.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_exit_codes.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_export.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_git_diff_new_files.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_incremental_save.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_integration.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_junit_reporter.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_log_formatter_read_tool.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_mcp_health_check.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_mcp_logging.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_models.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_output_validator.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_parse_errors.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_preflight.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_pricing.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_profiler.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_regression.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_reporting.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_runtime_tracking.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_schema.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_smoke_test.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_state_tracker.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_statistics.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_statistics_integration.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_streaming.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_string_concat_bug.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_templates.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_thinking_budget.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_timeout_tracking.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_tool_failure_tracking.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_trial_mode.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_type_safety.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/tests/test_xml_export.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.13}/uv.lock +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"$schema": "https://anthropic.com/claude-code/marketplace.schema.json",
|
|
3
3
|
"name": "mcpbr",
|
|
4
|
-
"version": "0.4.12",
|
|
4
|
+
"version": "0.4.13",
|
|
5
5
|
"description": "mcpbr - MCP Benchmark Runner plugin marketplace",
|
|
6
6
|
"owner": {
|
|
7
7
|
"name": "mcpbr Contributors",
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
{
|
|
12
12
|
"name": "mcpbr",
|
|
13
13
|
"description": "Expert benchmark runner for MCP servers using mcpbr. Handles Docker checks, config generation, and result parsing.",
|
|
14
|
-
"version": "0.4.12",
|
|
14
|
+
"version": "0.4.13",
|
|
15
15
|
"author": {
|
|
16
16
|
"name": "mcpbr Contributors"
|
|
17
17
|
},
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mcpbr
|
|
3
|
-
Version: 0.4.12
|
|
3
|
+
Version: 0.4.13
|
|
4
4
|
Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
|
|
5
5
|
Project-URL: Homepage, https://github.com/greynewell/mcpbr
|
|
6
6
|
Project-URL: Repository, https://github.com/greynewell/mcpbr
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "mcpbr"
|
|
7
|
-
version = "0.4.12"
|
|
7
|
+
version = "0.4.13"
|
|
8
8
|
description = "Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -452,9 +452,10 @@ DEFAULT_PROMPT = (
|
|
|
452
452
|
)
|
|
453
453
|
|
|
454
454
|
MCP_PROMPT_SUFFIX = (
|
|
455
|
-
"\n\nYou have access to an MCP server with additional tools. "
|
|
456
|
-
"
|
|
457
|
-
"
|
|
455
|
+
"\n\nYou have access to an MCP server with additional tools for codebase analysis. "
|
|
456
|
+
"Use these tools to understand the codebase structure, find definitions, trace call chains, "
|
|
457
|
+
"and navigate dependencies before making changes. The MCP tools are especially useful for "
|
|
458
|
+
"understanding how code is connected across files."
|
|
458
459
|
)
|
|
459
460
|
|
|
460
461
|
|
|
@@ -594,25 +595,27 @@ class ClaudeCodeHarness:
|
|
|
594
595
|
instance_id = task_id or task.get("instance_id", "unknown")
|
|
595
596
|
|
|
596
597
|
mcp_server_name = None
|
|
598
|
+
mcp_json_path = None
|
|
597
599
|
if self.mcp_server:
|
|
598
600
|
mcp_server_name = self.mcp_server.name
|
|
599
601
|
args = self.mcp_server.get_args_for_workdir(workdir)
|
|
600
602
|
mcp_env = self.mcp_server.get_expanded_env()
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
"
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
603
|
+
|
|
604
|
+
# Write .mcp.json file for Claude Code to discover MCP tools.
|
|
605
|
+
# This is more reliable than `claude mcp add` which can create broken
|
|
606
|
+
# tool registrations where the server connects but tools aren't routable.
|
|
607
|
+
mcp_config = {
|
|
608
|
+
"mcpServers": {
|
|
609
|
+
mcp_server_name: {
|
|
610
|
+
"type": "stdio",
|
|
611
|
+
"command": self.mcp_server.command,
|
|
612
|
+
"args": args,
|
|
613
|
+
"env": mcp_env,
|
|
614
|
+
}
|
|
615
|
+
}
|
|
616
|
+
}
|
|
617
|
+
mcp_json_path = os.path.join(workdir, ".mcp.json")
|
|
618
|
+
Path(mcp_json_path).write_text(json.dumps(mcp_config, indent=2))
|
|
616
619
|
|
|
617
620
|
try:
|
|
618
621
|
command = [
|
|
@@ -683,12 +686,8 @@ class ClaudeCodeHarness:
|
|
|
683
686
|
|
|
684
687
|
if exit_code != 0:
|
|
685
688
|
error_msg = stderr or "Unknown error"
|
|
686
|
-
if
|
|
687
|
-
|
|
688
|
-
["claude", "mcp", "remove", mcp_server_name],
|
|
689
|
-
workdir,
|
|
690
|
-
timeout=10,
|
|
691
|
-
)
|
|
689
|
+
if mcp_json_path and os.path.exists(mcp_json_path):
|
|
690
|
+
os.remove(mcp_json_path)
|
|
692
691
|
return AgentResult(
|
|
693
692
|
patch="",
|
|
694
693
|
success=False,
|
|
@@ -705,12 +704,8 @@ class ClaudeCodeHarness:
|
|
|
705
704
|
cost_usd=cost_usd,
|
|
706
705
|
)
|
|
707
706
|
|
|
708
|
-
if
|
|
709
|
-
|
|
710
|
-
["claude", "mcp", "remove", mcp_server_name],
|
|
711
|
-
workdir,
|
|
712
|
-
timeout=10,
|
|
713
|
-
)
|
|
707
|
+
if mcp_json_path and os.path.exists(mcp_json_path):
|
|
708
|
+
os.remove(mcp_json_path)
|
|
714
709
|
|
|
715
710
|
# Check git status to understand what happened
|
|
716
711
|
git_exit, git_status, git_stderr = await _run_cli_command(
|
|
@@ -747,12 +742,8 @@ class ClaudeCodeHarness:
|
|
|
747
742
|
cost_usd=cost_usd,
|
|
748
743
|
)
|
|
749
744
|
except Exception:
|
|
750
|
-
if
|
|
751
|
-
|
|
752
|
-
["claude", "mcp", "remove", mcp_server_name],
|
|
753
|
-
workdir,
|
|
754
|
-
timeout=10,
|
|
755
|
-
)
|
|
745
|
+
if mcp_json_path and os.path.exists(mcp_json_path):
|
|
746
|
+
os.remove(mcp_json_path)
|
|
756
747
|
raise
|
|
757
748
|
|
|
758
749
|
async def _solve_in_docker(
|
|
@@ -846,37 +837,36 @@ class ClaudeCodeHarness:
|
|
|
846
837
|
self._console.print(f"[cyan]Registering MCP server: {mcp_server_name}[/cyan]")
|
|
847
838
|
self._console.print(f"[dim] Command: {self.mcp_server.command} {args_str}[/dim]")
|
|
848
839
|
|
|
849
|
-
#
|
|
850
|
-
#
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
840
|
+
# Write .mcp.json to workdir for Claude Code to discover MCP tools.
|
|
841
|
+
# File-based config is more reliable than `claude mcp add` which can create
|
|
842
|
+
# broken tool registrations where the server connects but tools aren't routable.
|
|
843
|
+
mcp_config = {
|
|
844
|
+
"mcpServers": {
|
|
845
|
+
mcp_server_name: {
|
|
846
|
+
"type": "stdio",
|
|
847
|
+
"command": self.mcp_server.command,
|
|
848
|
+
"args": args,
|
|
849
|
+
"env": self.mcp_server.get_expanded_env(),
|
|
850
|
+
}
|
|
851
|
+
}
|
|
852
|
+
}
|
|
853
|
+
mcp_json_content = json.dumps(mcp_config, indent=2)
|
|
854
|
+
mcp_json_path = f"{env.workdir}/.mcp.json"
|
|
862
855
|
|
|
863
856
|
try:
|
|
864
857
|
mcp_exit_code, mcp_stdout, mcp_stderr = await env.exec_command(
|
|
865
|
-
|
|
866
|
-
timeout=
|
|
867
|
-
environment=docker_env,
|
|
858
|
+
f"cat > {mcp_json_path} << 'MCP_JSON_EOF'\n{mcp_json_content}\nMCP_JSON_EOF",
|
|
859
|
+
timeout=10,
|
|
868
860
|
)
|
|
861
|
+
await env.exec_command(f"chown mcpbr:mcpbr {mcp_json_path}", timeout=5)
|
|
869
862
|
|
|
870
863
|
if mcp_exit_code != 0:
|
|
871
|
-
error_msg = f"MCP
|
|
864
|
+
error_msg = f"MCP config write failed (exit {mcp_exit_code})"
|
|
872
865
|
if mcp_stderr:
|
|
873
866
|
error_msg += f": {mcp_stderr}"
|
|
874
|
-
if mcp_stdout:
|
|
875
|
-
error_msg += f"\nStdout: {mcp_stdout}"
|
|
876
867
|
if verbose:
|
|
877
868
|
self._console.print(f"[red]✗ {error_msg}[/red]")
|
|
878
869
|
|
|
879
|
-
# Clean up temp files before early return
|
|
880
870
|
await env.exec_command(f"rm -f {prompt_file} {env_file}", timeout=5)
|
|
881
871
|
|
|
882
872
|
return AgentResult(
|
|
@@ -889,16 +879,13 @@ class ClaudeCodeHarness:
|
|
|
889
879
|
)
|
|
890
880
|
|
|
891
881
|
if verbose:
|
|
892
|
-
self._console.print("[green]✓ MCP server
|
|
893
|
-
if mcp_stdout.strip():
|
|
894
|
-
self._console.print(f"[dim]{mcp_stdout.strip()}[/dim]")
|
|
882
|
+
self._console.print("[green]✓ MCP server configured via .mcp.json[/green]")
|
|
895
883
|
|
|
896
884
|
except asyncio.TimeoutError:
|
|
897
|
-
error_msg = "
|
|
885
|
+
error_msg = "Failed to write MCP configuration file."
|
|
898
886
|
if verbose:
|
|
899
887
|
self._console.print(f"[red]✗ {error_msg}[/red]")
|
|
900
888
|
|
|
901
|
-
# Clean up temp files before early return
|
|
902
889
|
await env.exec_command(f"rm -f {prompt_file} {env_file}", timeout=5)
|
|
903
890
|
|
|
904
891
|
return AgentResult(
|
|
@@ -1039,16 +1026,9 @@ class ClaudeCodeHarness:
|
|
|
1039
1026
|
error_msg += f"\nMCP server logs saved to: {mcp_log_path}"
|
|
1040
1027
|
|
|
1041
1028
|
if mcp_server_name:
|
|
1042
|
-
# Use shlex.quote() for MCP removal command
|
|
1043
|
-
quoted_env_file = shlex.quote(env_file)
|
|
1044
|
-
quoted_server_name = shlex.quote(mcp_server_name)
|
|
1045
|
-
remove_cmd = (
|
|
1046
|
-
f"source {quoted_env_file} && claude mcp remove {quoted_server_name}"
|
|
1047
|
-
)
|
|
1048
1029
|
await env.exec_command(
|
|
1049
|
-
f"
|
|
1050
|
-
timeout=
|
|
1051
|
-
environment=docker_env,
|
|
1030
|
+
f"rm -f {env.workdir}/.mcp.json",
|
|
1031
|
+
timeout=5,
|
|
1052
1032
|
)
|
|
1053
1033
|
|
|
1054
1034
|
return AgentResult(
|
|
@@ -1068,14 +1048,9 @@ class ClaudeCodeHarness:
|
|
|
1068
1048
|
)
|
|
1069
1049
|
|
|
1070
1050
|
if mcp_server_name:
|
|
1071
|
-
# Use shlex.quote() for MCP removal command
|
|
1072
|
-
quoted_env_file = shlex.quote(env_file)
|
|
1073
|
-
quoted_server_name = shlex.quote(mcp_server_name)
|
|
1074
|
-
remove_cmd = f"source {quoted_env_file} && claude mcp remove {quoted_server_name}"
|
|
1075
1051
|
await env.exec_command(
|
|
1076
|
-
f"
|
|
1077
|
-
timeout=
|
|
1078
|
-
environment=docker_env,
|
|
1052
|
+
f"rm -f {env.workdir}/.mcp.json",
|
|
1053
|
+
timeout=5,
|
|
1079
1054
|
)
|
|
1080
1055
|
|
|
1081
1056
|
_, git_status, git_stderr = await env.exec_command(
|
|
@@ -1160,20 +1135,13 @@ class ClaudeCodeHarness:
|
|
|
1160
1135
|
|
|
1161
1136
|
if mcp_server_name:
|
|
1162
1137
|
try:
|
|
1163
|
-
# Use shlex.quote() for MCP removal command
|
|
1164
|
-
quoted_env_file = shlex.quote(env_file)
|
|
1165
|
-
quoted_server_name = shlex.quote(mcp_server_name)
|
|
1166
|
-
remove_cmd = (
|
|
1167
|
-
f"source {quoted_env_file} && claude mcp remove {quoted_server_name}"
|
|
1168
|
-
)
|
|
1169
1138
|
await env.exec_command(
|
|
1170
|
-
f"
|
|
1171
|
-
timeout=
|
|
1172
|
-
environment=docker_env,
|
|
1139
|
+
f"rm -f {env.workdir}/.mcp.json",
|
|
1140
|
+
timeout=5,
|
|
1173
1141
|
)
|
|
1174
1142
|
except Exception as e:
|
|
1175
1143
|
if verbose:
|
|
1176
|
-
self._console.print(f"[dim red]Failed to
|
|
1144
|
+
self._console.print(f"[dim red]Failed to clean up .mcp.json: {e}[/dim red]")
|
|
1177
1145
|
|
|
1178
1146
|
error_msg = f"Task execution timed out after {timeout}s."
|
|
1179
1147
|
if self.mcp_server:
|
|
@@ -1204,20 +1172,13 @@ class ClaudeCodeHarness:
|
|
|
1204
1172
|
except Exception:
|
|
1205
1173
|
if mcp_server_name:
|
|
1206
1174
|
try:
|
|
1207
|
-
# Use shlex.quote() for MCP removal command
|
|
1208
|
-
quoted_env_file = shlex.quote(env_file)
|
|
1209
|
-
quoted_server_name = shlex.quote(mcp_server_name)
|
|
1210
|
-
remove_cmd = (
|
|
1211
|
-
f"source {quoted_env_file} && claude mcp remove {quoted_server_name}"
|
|
1212
|
-
)
|
|
1213
1175
|
await env.exec_command(
|
|
1214
|
-
f"
|
|
1215
|
-
timeout=
|
|
1216
|
-
environment=docker_env,
|
|
1176
|
+
f"rm -f {env.workdir}/.mcp.json",
|
|
1177
|
+
timeout=5,
|
|
1217
1178
|
)
|
|
1218
1179
|
except Exception as e:
|
|
1219
1180
|
if verbose:
|
|
1220
|
-
self._console.print(f"[dim red]Failed to
|
|
1181
|
+
self._console.print(f"[dim red]Failed to clean up .mcp.json: {e}[/dim red]")
|
|
1221
1182
|
raise
|
|
1222
1183
|
finally:
|
|
1223
1184
|
# Close MCP log file if it was opened
|
|
@@ -1230,7 +1191,9 @@ class ClaudeCodeHarness:
|
|
|
1230
1191
|
if verbose:
|
|
1231
1192
|
self._console.print(f"[dim red]Failed to close MCP log file: {e}[/dim red]")
|
|
1232
1193
|
|
|
1233
|
-
await env.exec_command(
|
|
1194
|
+
await env.exec_command(
|
|
1195
|
+
f"rm -f {prompt_file} {env_file} {env.workdir}/.mcp.json", timeout=5
|
|
1196
|
+
)
|
|
1234
1197
|
|
|
1235
1198
|
|
|
1236
1199
|
HARNESS_REGISTRY: dict[str, type] = {
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|