mcpbr 0.4.12__tar.gz → 0.4.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.claude-plugin/marketplace.json +2 -2
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.claude-plugin/package.json +1 -1
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.claude-plugin/plugin.json +1 -1
- {mcpbr-0.4.12 → mcpbr-0.4.14}/CHANGELOG.md +2 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/PKG-INFO +1 -1
- {mcpbr-0.4.12 → mcpbr-0.4.14}/package.json +1 -1
- {mcpbr-0.4.12 → mcpbr-0.4.14}/pyproject.toml +1 -1
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/config.py +22 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/docker_env.py +15 -4
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/evaluation.py +19 -2
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/harness.py +4 -1
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/harnesses.py +90 -98
- mcpbr-0.4.14/src/mcpbr/swebench_test_specs.py +33 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/infrastructure/test_azure.py +82 -36
- mcpbr-0.4.12/tests/test_django_runner.py → mcpbr-0.4.14/tests/test_build_test_command.py +73 -1
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_trial_mode.py +6 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/uv.lock +1 -1
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.claude/settings.json +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.claude-plugin/README.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.claude-plugin/skills/README.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.claude-plugin/skills/benchmark-swe-lite/SKILL.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.claude-plugin/skills/mcpbr-config/SKILL.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.claude-plugin/skills/mcpbr-eval/SKILL.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.github/dependabot.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.github/release-drafter.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.github/workflows/ci.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.github/workflows/post-release-bump.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.github/workflows/publish-npm.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.github/workflows/publish.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.github/workflows/release-drafter.yml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.gitignore +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/.pre-commit-config.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/AGENTS.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/CLAUDE.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/CODE_OF_CONDUCT.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/CONTRIBUTING.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/Dockerfile +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/HUMANEVAL_FIX_SUMMARY.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/LICENSE +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/Makefile +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/PR_SUMMARY.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/README.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/SECURITY.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/assets/mcpbr-demo.gif +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/assets/mcpbr-eval-results.png +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/assets/mcpbr-logo.jpg +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/bin/mcpbr.js +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/config/example.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/config/humaneval.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/config/supermodel.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/examples/azure-config-example.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/examples/env-vars-example.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/examples/inheritance/README.md +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/examples/inheritance/base-config.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/examples/inheritance/dev-config.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/examples/inheritance/multi-extend-config.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/examples/inheritance/production-config.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/examples/inheritance/shared-mcp-settings.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/examples/local-config-example.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/examples/quick-start/gsm8k-math-reasoning.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/examples/quick-start/test-your-mcp-server.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/install.sh +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/requirements.txt +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/scripts/sync_version.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/scripts/validate_plugin_manifests.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/__init__.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/__main__.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/agent.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/__init__.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/agentbench.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/aider_polyglot.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/apps.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/arc.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/base.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/bigbench_hard.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/bigcodebench.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/codecontests.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/codereval.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/cybergym.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/gaia.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/gsm8k.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/hellaswag.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/humaneval.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/intercode.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/leetcode.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/math_benchmark.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/mbpp.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/mcptoolbench.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/mlagentbench.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/repoqa.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/swebench.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/terminalbench.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/toolbench.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/truthfulqa.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/benchmarks/webarena.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/cache.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/cli.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/config_inheritance.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/config_validator.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/env_expansion.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/incremental_save.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/infrastructure/__init__.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/infrastructure/azure.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/infrastructure/azure_health.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/infrastructure/base.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/infrastructure/local.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/infrastructure/manager.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/junit_reporter.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/log_formatter.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/models.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/output_validator.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/preflight.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/pricing.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/profiler.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/providers.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/regression.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/reporting.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/schema.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/smoke_test.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/state_tracker.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/statistics.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/streaming.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/src/mcpbr/templates.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/__init__.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/infrastructure/__init__.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/infrastructure/test_azure_health.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/infrastructure/test_base.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/infrastructure/test_cli_infrastructure.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/infrastructure/test_config.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/infrastructure/test_local.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/infrastructure/test_manager.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_agent.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_benchmark_filtering.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_benchmark_integration.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_benchmarks.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_cache.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_claude_plugin.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_cli_templates.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_comparison_aggregation.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_comparison_config.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_comparison_integration.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_comparison_reporting.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_config.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_config_env_vars.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_config_inheritance.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_config_validator.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_config_validator_inheritance.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_cost_calculation.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_default_logging.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_docker_cleanup.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_docker_label_fix.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_docker_retry.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_env_expansion.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_error_messages.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_evaluation.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_exit_codes.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_export.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_git_diff_new_files.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_incremental_save.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_integration.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_junit_reporter.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_log_formatter_read_tool.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_mcp_health_check.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_mcp_logging.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_models.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_output_validator.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_parse_errors.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_preflight.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_pricing.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_profiler.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_regression.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_reporting.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_runtime_tracking.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_schema.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_smoke_test.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_state_tracker.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_statistics.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_statistics_integration.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_streaming.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_string_concat_bug.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_templates.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_thinking_budget.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_timeout_tracking.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_tool_failure_tracking.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_type_safety.py +0 -0
- {mcpbr-0.4.12 → mcpbr-0.4.14}/tests/test_xml_export.py +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"$schema": "https://anthropic.com/claude-code/marketplace.schema.json",
|
|
3
3
|
"name": "mcpbr",
|
|
4
|
-
"version": "0.4.12",
|
|
4
|
+
"version": "0.4.14",
|
|
5
5
|
"description": "mcpbr - MCP Benchmark Runner plugin marketplace",
|
|
6
6
|
"owner": {
|
|
7
7
|
"name": "mcpbr Contributors",
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
{
|
|
12
12
|
"name": "mcpbr",
|
|
13
13
|
"description": "Expert benchmark runner for MCP servers using mcpbr. Handles Docker checks, config generation, and result parsing.",
|
|
14
|
-
"version": "0.4.12",
|
|
14
|
+
"version": "0.4.14",
|
|
15
15
|
"author": {
|
|
16
16
|
"name": "mcpbr Contributors"
|
|
17
17
|
},
|
|
@@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
13
13
|
|
|
14
14
|
### Fixed
|
|
15
15
|
|
|
16
|
+
- **Repository-aware test commands for non-pytest projects** (#365): Use upstream SWE-bench test command specs for sympy (`bin/test`), sphinx (`tox`), and other non-pytest repos instead of defaulting to `python -m pytest`
|
|
17
|
+
- **Flaky Azure and trial mode tests**: Fixed tests that depended on local `~/.ssh/mcpbr_azure` state and updated assertions for multi-step dependency installation
|
|
16
18
|
- **SEO improvements** for documentation site
|
|
17
19
|
- Added robots.txt with sitemap reference
|
|
18
20
|
- Added Open Graph and Twitter Card meta tags on all pages
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mcpbr
|
|
3
|
-
Version: 0.4.12
|
|
3
|
+
Version: 0.4.14
|
|
4
4
|
Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
|
|
5
5
|
Project-URL: Homepage, https://github.com/greynewell/mcpbr
|
|
6
6
|
Project-URL: Repository, https://github.com/greynewell/mcpbr
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "mcpbr"
|
|
7
|
-
version = "0.4.12"
|
|
7
|
+
version = "0.4.14"
|
|
8
8
|
description = "Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -109,6 +109,16 @@ class MCPServerConfig(BaseModel):
|
|
|
109
109
|
default=900000,
|
|
110
110
|
description="Timeout in milliseconds for MCP tool execution (default: 15 min for long-running tools)",
|
|
111
111
|
)
|
|
112
|
+
setup_command: str | None = Field(
|
|
113
|
+
default=None,
|
|
114
|
+
description="Shell command to run inside the container BEFORE the agent starts. "
|
|
115
|
+
"Runs outside the task timer (does not count against timeout_seconds). "
|
|
116
|
+
"Use {workdir} as placeholder. Useful for pre-computing caches.",
|
|
117
|
+
)
|
|
118
|
+
setup_timeout_ms: int = Field(
|
|
119
|
+
default=900000,
|
|
120
|
+
description="Timeout in milliseconds for the setup_command (default: 15 min)",
|
|
121
|
+
)
|
|
112
122
|
|
|
113
123
|
def get_args_for_workdir(self, workdir: str) -> list[str]:
|
|
114
124
|
"""Replace {workdir} placeholder in args with actual path."""
|
|
@@ -117,6 +127,12 @@ class MCPServerConfig(BaseModel):
|
|
|
117
127
|
result.append(arg.replace("{workdir}", workdir))
|
|
118
128
|
return result
|
|
119
129
|
|
|
130
|
+
def get_setup_command_for_workdir(self, workdir: str) -> str | None:
|
|
131
|
+
"""Replace {workdir} placeholder in setup_command with actual path."""
|
|
132
|
+
if self.setup_command is None:
|
|
133
|
+
return None
|
|
134
|
+
return self.setup_command.replace("{workdir}", workdir)
|
|
135
|
+
|
|
120
136
|
def get_expanded_env(self) -> dict[str, str]:
|
|
121
137
|
"""Expand ${VAR} references in env values using os.environ.
|
|
122
138
|
|
|
@@ -400,6 +416,12 @@ class HarnessConfig(BaseModel):
|
|
|
400
416
|
description="Enable comprehensive performance profiling (tool latency, memory, overhead)",
|
|
401
417
|
)
|
|
402
418
|
|
|
419
|
+
volumes: dict[str, str] = Field(
|
|
420
|
+
default_factory=dict,
|
|
421
|
+
description="Additional volume mounts (read-write) for Docker containers (host_path: container_path). "
|
|
422
|
+
"Mounted into every container, persists across tasks. Useful for pre-computed caches.",
|
|
423
|
+
)
|
|
424
|
+
|
|
403
425
|
infrastructure: InfrastructureConfig = Field(
|
|
404
426
|
default_factory=InfrastructureConfig,
|
|
405
427
|
description="Infrastructure configuration (local or azure)",
|
|
@@ -314,14 +314,18 @@ class DockerEnvironmentManager:
|
|
|
314
314
|
FALLBACK_IMAGE = "mcpbr-env"
|
|
315
315
|
DOCKERFILE_PATH = Path(__file__).parent.parent.parent / "Dockerfile"
|
|
316
316
|
|
|
317
|
-
def __init__(
|
|
317
|
+
def __init__(
|
|
318
|
+
self, use_prebuilt: bool = True, extra_volumes: dict[str, str] | None = None
|
|
319
|
+
) -> None:
|
|
318
320
|
"""Initialize the Docker environment manager.
|
|
319
321
|
|
|
320
322
|
Args:
|
|
321
323
|
use_prebuilt: If True, try to use pre-built SWE-bench images first.
|
|
324
|
+
extra_volumes: Additional volume mounts (read-write) (host_path -> container_path).
|
|
322
325
|
"""
|
|
323
326
|
self.client = docker.from_env()
|
|
324
327
|
self.use_prebuilt = use_prebuilt
|
|
328
|
+
self._extra_volumes = extra_volumes or {}
|
|
325
329
|
self._fallback_image_built = False
|
|
326
330
|
self._temp_dirs: list[tempfile.TemporaryDirectory[str]] = []
|
|
327
331
|
self._containers: list[Container] = []
|
|
@@ -488,6 +492,15 @@ CMD ["/bin/bash"]
|
|
|
488
492
|
|
|
489
493
|
for attempt in range(max_retries + 1):
|
|
490
494
|
try:
|
|
495
|
+
volumes_dict: dict[str, dict[str, str]] = {
|
|
496
|
+
host_workdir: {"bind": "/workspace", "mode": "rw"},
|
|
497
|
+
}
|
|
498
|
+
for host_path, container_path in self._extra_volumes.items():
|
|
499
|
+
volumes_dict[os.path.abspath(host_path)] = {
|
|
500
|
+
"bind": container_path,
|
|
501
|
+
"mode": "rw",
|
|
502
|
+
}
|
|
503
|
+
|
|
491
504
|
container = self.client.containers.run(
|
|
492
505
|
image_name,
|
|
493
506
|
command="tail -f /dev/null",
|
|
@@ -495,9 +508,7 @@ CMD ["/bin/bash"]
|
|
|
495
508
|
detach=True,
|
|
496
509
|
platform="linux/amd64" if uses_prebuilt else None,
|
|
497
510
|
network_mode="bridge", # Enable network for API calls
|
|
498
|
-
volumes={
|
|
499
|
-
host_workdir: {"bind": "/workspace", "mode": "rw"},
|
|
500
|
-
},
|
|
511
|
+
volumes=volumes_dict,
|
|
501
512
|
working_dir=container_workdir,
|
|
502
513
|
remove=False,
|
|
503
514
|
labels={
|
|
@@ -137,6 +137,7 @@ async def run_tests(
|
|
|
137
137
|
timeout: int = 120,
|
|
138
138
|
uses_prebuilt: bool = False,
|
|
139
139
|
workdir: str | None = None,
|
|
140
|
+
repo: str | None = None,
|
|
140
141
|
) -> TestResults:
|
|
141
142
|
"""Run a list of tests and return results.
|
|
142
143
|
|
|
@@ -146,6 +147,7 @@ async def run_tests(
|
|
|
146
147
|
timeout: Timeout per test in seconds.
|
|
147
148
|
uses_prebuilt: Whether a pre-built SWE-bench image is being used.
|
|
148
149
|
workdir: Working directory to run tests from. Defaults to env.workdir.
|
|
150
|
+
repo: Repository identifier for looking up the correct test runner.
|
|
149
151
|
|
|
150
152
|
Returns:
|
|
151
153
|
TestResults with pass/fail counts.
|
|
@@ -157,7 +159,7 @@ async def run_tests(
|
|
|
157
159
|
passed = 0
|
|
158
160
|
|
|
159
161
|
for test in tests:
|
|
160
|
-
test_cmd = _build_test_command(test, uses_prebuilt)
|
|
162
|
+
test_cmd = _build_test_command(test, uses_prebuilt, repo=repo)
|
|
161
163
|
|
|
162
164
|
try:
|
|
163
165
|
exit_code, stdout, stderr = await env.exec_command(
|
|
@@ -198,7 +200,7 @@ async def run_tests(
|
|
|
198
200
|
)
|
|
199
201
|
|
|
200
202
|
|
|
201
|
-
def _build_test_command(test: str, uses_prebuilt: bool = False) -> str:
|
|
203
|
+
def _build_test_command(test: str, uses_prebuilt: bool = False, repo: str | None = None) -> str:
|
|
202
204
|
"""Build a test command for the given test identifier.
|
|
203
205
|
|
|
204
206
|
Args:
|
|
@@ -206,18 +208,29 @@ def _build_test_command(test: str, uses_prebuilt: bool = False) -> str:
|
|
|
206
208
|
- pytest: "tests/test_file.py::test_func" or "tests/test_file.py"
|
|
207
209
|
- Django: "test_method (module.TestClass)" or "module.tests.TestClass.test_method"
|
|
208
210
|
uses_prebuilt: If True, activate the testbed conda environment first.
|
|
211
|
+
repo: Repository identifier (e.g., "sympy/sympy") for looking up
|
|
212
|
+
the correct test runner from upstream SWE-bench specs.
|
|
209
213
|
|
|
210
214
|
Returns:
|
|
211
215
|
Shell command string to run the test.
|
|
212
216
|
"""
|
|
213
217
|
import re
|
|
214
218
|
|
|
219
|
+
from .swebench_test_specs import get_repo_test_command
|
|
220
|
+
|
|
215
221
|
# Pre-built SWE-bench images use a conda environment called 'testbed'
|
|
216
222
|
if uses_prebuilt:
|
|
217
223
|
activate = "source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed && "
|
|
218
224
|
else:
|
|
219
225
|
activate = ""
|
|
220
226
|
|
|
227
|
+
# Check upstream SWE-bench test command mapping for non-pytest runners
|
|
228
|
+
if repo:
|
|
229
|
+
upstream_cmd = get_repo_test_command(repo)
|
|
230
|
+
if upstream_cmd and "runtests.py" not in upstream_cmd and "pytest" not in upstream_cmd:
|
|
231
|
+
# Non-pytest, non-Django project (e.g., sympy uses bin/test)
|
|
232
|
+
return f"{activate}{upstream_cmd} {test}"
|
|
233
|
+
|
|
221
234
|
# Detect Django test format: "test_method (module.TestClass)"
|
|
222
235
|
if "(" in test and ")" in test and "." in test:
|
|
223
236
|
# Extract module path from parentheses
|
|
@@ -344,12 +357,15 @@ async def evaluate_patch(
|
|
|
344
357
|
if not env.uses_prebuilt:
|
|
345
358
|
await _install_dependencies(env)
|
|
346
359
|
|
|
360
|
+
repo = task.get("repo")
|
|
361
|
+
|
|
347
362
|
fail_to_pass_results = await run_tests(
|
|
348
363
|
env,
|
|
349
364
|
fail_to_pass_tests,
|
|
350
365
|
timeout=test_timeout,
|
|
351
366
|
uses_prebuilt=env.uses_prebuilt,
|
|
352
367
|
workdir=eval_workdir,
|
|
368
|
+
repo=repo,
|
|
353
369
|
)
|
|
354
370
|
|
|
355
371
|
pass_to_pass_results = await run_tests(
|
|
@@ -358,6 +374,7 @@ async def evaluate_patch(
|
|
|
358
374
|
timeout=test_timeout,
|
|
359
375
|
uses_prebuilt=env.uses_prebuilt,
|
|
360
376
|
workdir=eval_workdir,
|
|
377
|
+
repo=repo,
|
|
361
378
|
)
|
|
362
379
|
|
|
363
380
|
resolved = (
|
|
@@ -962,7 +962,10 @@ async def run_evaluation(
|
|
|
962
962
|
"args": config.mcp_server.args if config.mcp_server else [],
|
|
963
963
|
}
|
|
964
964
|
|
|
965
|
-
docker_manager = DockerEnvironmentManager(
|
|
965
|
+
docker_manager = DockerEnvironmentManager(
|
|
966
|
+
use_prebuilt=config.use_prebuilt_images,
|
|
967
|
+
extra_volumes=config.volumes,
|
|
968
|
+
)
|
|
966
969
|
|
|
967
970
|
results: list[TaskResult] = []
|
|
968
971
|
# Add cached results if using state tracker
|
|
@@ -452,9 +452,10 @@ DEFAULT_PROMPT = (
|
|
|
452
452
|
)
|
|
453
453
|
|
|
454
454
|
MCP_PROMPT_SUFFIX = (
|
|
455
|
-
"\n\nYou have access to an MCP server with additional tools. "
|
|
456
|
-
"
|
|
457
|
-
"
|
|
455
|
+
"\n\nYou have access to an MCP server with additional tools for codebase analysis. "
|
|
456
|
+
"Use these tools to understand the codebase structure, find definitions, trace call chains, "
|
|
457
|
+
"and navigate dependencies before making changes. The MCP tools are especially useful for "
|
|
458
|
+
"understanding how code is connected across files."
|
|
458
459
|
)
|
|
459
460
|
|
|
460
461
|
|
|
@@ -594,25 +595,27 @@ class ClaudeCodeHarness:
|
|
|
594
595
|
instance_id = task_id or task.get("instance_id", "unknown")
|
|
595
596
|
|
|
596
597
|
mcp_server_name = None
|
|
598
|
+
mcp_json_path = None
|
|
597
599
|
if self.mcp_server:
|
|
598
600
|
mcp_server_name = self.mcp_server.name
|
|
599
601
|
args = self.mcp_server.get_args_for_workdir(workdir)
|
|
600
602
|
mcp_env = self.mcp_server.get_expanded_env()
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
"
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
603
|
+
|
|
604
|
+
# Write .mcp.json file for Claude Code to discover MCP tools.
|
|
605
|
+
# This is more reliable than `claude mcp add` which can create broken
|
|
606
|
+
# tool registrations where the server connects but tools aren't routable.
|
|
607
|
+
mcp_config = {
|
|
608
|
+
"mcpServers": {
|
|
609
|
+
mcp_server_name: {
|
|
610
|
+
"type": "stdio",
|
|
611
|
+
"command": self.mcp_server.command,
|
|
612
|
+
"args": args,
|
|
613
|
+
"env": mcp_env,
|
|
614
|
+
}
|
|
615
|
+
}
|
|
616
|
+
}
|
|
617
|
+
mcp_json_path = os.path.join(workdir, ".mcp.json")
|
|
618
|
+
Path(mcp_json_path).write_text(json.dumps(mcp_config, indent=2))
|
|
616
619
|
|
|
617
620
|
try:
|
|
618
621
|
command = [
|
|
@@ -683,12 +686,8 @@ class ClaudeCodeHarness:
|
|
|
683
686
|
|
|
684
687
|
if exit_code != 0:
|
|
685
688
|
error_msg = stderr or "Unknown error"
|
|
686
|
-
if
|
|
687
|
-
|
|
688
|
-
["claude", "mcp", "remove", mcp_server_name],
|
|
689
|
-
workdir,
|
|
690
|
-
timeout=10,
|
|
691
|
-
)
|
|
689
|
+
if mcp_json_path and os.path.exists(mcp_json_path):
|
|
690
|
+
os.remove(mcp_json_path)
|
|
692
691
|
return AgentResult(
|
|
693
692
|
patch="",
|
|
694
693
|
success=False,
|
|
@@ -705,12 +704,8 @@ class ClaudeCodeHarness:
|
|
|
705
704
|
cost_usd=cost_usd,
|
|
706
705
|
)
|
|
707
706
|
|
|
708
|
-
if
|
|
709
|
-
|
|
710
|
-
["claude", "mcp", "remove", mcp_server_name],
|
|
711
|
-
workdir,
|
|
712
|
-
timeout=10,
|
|
713
|
-
)
|
|
707
|
+
if mcp_json_path and os.path.exists(mcp_json_path):
|
|
708
|
+
os.remove(mcp_json_path)
|
|
714
709
|
|
|
715
710
|
# Check git status to understand what happened
|
|
716
711
|
git_exit, git_status, git_stderr = await _run_cli_command(
|
|
@@ -747,12 +742,8 @@ class ClaudeCodeHarness:
|
|
|
747
742
|
cost_usd=cost_usd,
|
|
748
743
|
)
|
|
749
744
|
except Exception:
|
|
750
|
-
if
|
|
751
|
-
|
|
752
|
-
["claude", "mcp", "remove", mcp_server_name],
|
|
753
|
-
workdir,
|
|
754
|
-
timeout=10,
|
|
755
|
-
)
|
|
745
|
+
if mcp_json_path and os.path.exists(mcp_json_path):
|
|
746
|
+
os.remove(mcp_json_path)
|
|
756
747
|
raise
|
|
757
748
|
|
|
758
749
|
async def _solve_in_docker(
|
|
@@ -846,37 +837,36 @@ class ClaudeCodeHarness:
|
|
|
846
837
|
self._console.print(f"[cyan]Registering MCP server: {mcp_server_name}[/cyan]")
|
|
847
838
|
self._console.print(f"[dim] Command: {self.mcp_server.command} {args_str}[/dim]")
|
|
848
839
|
|
|
849
|
-
#
|
|
850
|
-
#
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
840
|
+
# Write .mcp.json to workdir for Claude Code to discover MCP tools.
|
|
841
|
+
# File-based config is more reliable than `claude mcp add` which can create
|
|
842
|
+
# broken tool registrations where the server connects but tools aren't routable.
|
|
843
|
+
mcp_config = {
|
|
844
|
+
"mcpServers": {
|
|
845
|
+
mcp_server_name: {
|
|
846
|
+
"type": "stdio",
|
|
847
|
+
"command": self.mcp_server.command,
|
|
848
|
+
"args": args,
|
|
849
|
+
"env": self.mcp_server.get_expanded_env(),
|
|
850
|
+
}
|
|
851
|
+
}
|
|
852
|
+
}
|
|
853
|
+
mcp_json_content = json.dumps(mcp_config, indent=2)
|
|
854
|
+
mcp_json_path = f"{env.workdir}/.mcp.json"
|
|
862
855
|
|
|
863
856
|
try:
|
|
864
857
|
mcp_exit_code, mcp_stdout, mcp_stderr = await env.exec_command(
|
|
865
|
-
|
|
866
|
-
timeout=
|
|
867
|
-
environment=docker_env,
|
|
858
|
+
f"cat > {mcp_json_path} << 'MCP_JSON_EOF'\n{mcp_json_content}\nMCP_JSON_EOF",
|
|
859
|
+
timeout=10,
|
|
868
860
|
)
|
|
861
|
+
await env.exec_command(f"chown mcpbr:mcpbr {mcp_json_path}", timeout=5)
|
|
869
862
|
|
|
870
863
|
if mcp_exit_code != 0:
|
|
871
|
-
error_msg = f"MCP
|
|
864
|
+
error_msg = f"MCP config write failed (exit {mcp_exit_code})"
|
|
872
865
|
if mcp_stderr:
|
|
873
866
|
error_msg += f": {mcp_stderr}"
|
|
874
|
-
if mcp_stdout:
|
|
875
|
-
error_msg += f"\nStdout: {mcp_stdout}"
|
|
876
867
|
if verbose:
|
|
877
868
|
self._console.print(f"[red]✗ {error_msg}[/red]")
|
|
878
869
|
|
|
879
|
-
# Clean up temp files before early return
|
|
880
870
|
await env.exec_command(f"rm -f {prompt_file} {env_file}", timeout=5)
|
|
881
871
|
|
|
882
872
|
return AgentResult(
|
|
@@ -889,16 +879,13 @@ class ClaudeCodeHarness:
|
|
|
889
879
|
)
|
|
890
880
|
|
|
891
881
|
if verbose:
|
|
892
|
-
self._console.print("[green]✓ MCP server
|
|
893
|
-
if mcp_stdout.strip():
|
|
894
|
-
self._console.print(f"[dim]{mcp_stdout.strip()}[/dim]")
|
|
882
|
+
self._console.print("[green]✓ MCP server configured via .mcp.json[/green]")
|
|
895
883
|
|
|
896
884
|
except asyncio.TimeoutError:
|
|
897
|
-
error_msg = "
|
|
885
|
+
error_msg = "Failed to write MCP configuration file."
|
|
898
886
|
if verbose:
|
|
899
887
|
self._console.print(f"[red]✗ {error_msg}[/red]")
|
|
900
888
|
|
|
901
|
-
# Clean up temp files before early return
|
|
902
889
|
await env.exec_command(f"rm -f {prompt_file} {env_file}", timeout=5)
|
|
903
890
|
|
|
904
891
|
return AgentResult(
|
|
@@ -908,6 +895,35 @@ class ClaudeCodeHarness:
|
|
|
908
895
|
cost_usd=None,
|
|
909
896
|
)
|
|
910
897
|
|
|
898
|
+
# Run setup_command if configured (BEFORE agent, OUTSIDE task timer).
|
|
899
|
+
# This is the right place for expensive one-time operations like
|
|
900
|
+
# pre-computing caches that should not count against timeout_seconds.
|
|
901
|
+
if self.mcp_server and self.mcp_server.setup_command:
|
|
902
|
+
setup_cmd = self.mcp_server.get_setup_command_for_workdir(env.workdir)
|
|
903
|
+
setup_timeout = int(self.mcp_server.setup_timeout_ms / 1000)
|
|
904
|
+
|
|
905
|
+
if verbose:
|
|
906
|
+
self._console.print(
|
|
907
|
+
f"[cyan]Running setup command (timeout: {setup_timeout:.0f}s)...[/cyan]"
|
|
908
|
+
)
|
|
909
|
+
|
|
910
|
+
setup_full_cmd = f"source {shlex.quote(env_file)} && {setup_cmd}"
|
|
911
|
+
setup_exit, _setup_stdout, setup_stderr = await env.exec_command(
|
|
912
|
+
["/bin/bash", "-c", setup_full_cmd],
|
|
913
|
+
timeout=setup_timeout,
|
|
914
|
+
)
|
|
915
|
+
|
|
916
|
+
if setup_exit != 0:
|
|
917
|
+
if verbose:
|
|
918
|
+
self._console.print(
|
|
919
|
+
f"[yellow]⚠ Setup command exited with code {setup_exit}[/yellow]"
|
|
920
|
+
)
|
|
921
|
+
if setup_stderr:
|
|
922
|
+
self._console.print(f"[dim]{setup_stderr[:500]}[/dim]")
|
|
923
|
+
# Non-fatal: continue with agent even if setup fails
|
|
924
|
+
elif verbose:
|
|
925
|
+
self._console.print("[green]✓ Setup command completed[/green]")
|
|
926
|
+
|
|
911
927
|
try:
|
|
912
928
|
claude_args = [
|
|
913
929
|
"--print",
|
|
@@ -1039,16 +1055,9 @@ class ClaudeCodeHarness:
|
|
|
1039
1055
|
error_msg += f"\nMCP server logs saved to: {mcp_log_path}"
|
|
1040
1056
|
|
|
1041
1057
|
if mcp_server_name:
|
|
1042
|
-
# Use shlex.quote() for MCP removal command
|
|
1043
|
-
quoted_env_file = shlex.quote(env_file)
|
|
1044
|
-
quoted_server_name = shlex.quote(mcp_server_name)
|
|
1045
|
-
remove_cmd = (
|
|
1046
|
-
f"source {quoted_env_file} && claude mcp remove {quoted_server_name}"
|
|
1047
|
-
)
|
|
1048
1058
|
await env.exec_command(
|
|
1049
|
-
f"
|
|
1050
|
-
timeout=
|
|
1051
|
-
environment=docker_env,
|
|
1059
|
+
f"rm -f {env.workdir}/.mcp.json",
|
|
1060
|
+
timeout=5,
|
|
1052
1061
|
)
|
|
1053
1062
|
|
|
1054
1063
|
return AgentResult(
|
|
@@ -1068,14 +1077,9 @@ class ClaudeCodeHarness:
|
|
|
1068
1077
|
)
|
|
1069
1078
|
|
|
1070
1079
|
if mcp_server_name:
|
|
1071
|
-
# Use shlex.quote() for MCP removal command
|
|
1072
|
-
quoted_env_file = shlex.quote(env_file)
|
|
1073
|
-
quoted_server_name = shlex.quote(mcp_server_name)
|
|
1074
|
-
remove_cmd = f"source {quoted_env_file} && claude mcp remove {quoted_server_name}"
|
|
1075
1080
|
await env.exec_command(
|
|
1076
|
-
f"
|
|
1077
|
-
timeout=
|
|
1078
|
-
environment=docker_env,
|
|
1081
|
+
f"rm -f {env.workdir}/.mcp.json",
|
|
1082
|
+
timeout=5,
|
|
1079
1083
|
)
|
|
1080
1084
|
|
|
1081
1085
|
_, git_status, git_stderr = await env.exec_command(
|
|
@@ -1160,20 +1164,13 @@ class ClaudeCodeHarness:
|
|
|
1160
1164
|
|
|
1161
1165
|
if mcp_server_name:
|
|
1162
1166
|
try:
|
|
1163
|
-
# Use shlex.quote() for MCP removal command
|
|
1164
|
-
quoted_env_file = shlex.quote(env_file)
|
|
1165
|
-
quoted_server_name = shlex.quote(mcp_server_name)
|
|
1166
|
-
remove_cmd = (
|
|
1167
|
-
f"source {quoted_env_file} && claude mcp remove {quoted_server_name}"
|
|
1168
|
-
)
|
|
1169
1167
|
await env.exec_command(
|
|
1170
|
-
f"
|
|
1171
|
-
timeout=
|
|
1172
|
-
environment=docker_env,
|
|
1168
|
+
f"rm -f {env.workdir}/.mcp.json",
|
|
1169
|
+
timeout=5,
|
|
1173
1170
|
)
|
|
1174
1171
|
except Exception as e:
|
|
1175
1172
|
if verbose:
|
|
1176
|
-
self._console.print(f"[dim red]Failed to
|
|
1173
|
+
self._console.print(f"[dim red]Failed to clean up .mcp.json: {e}[/dim red]")
|
|
1177
1174
|
|
|
1178
1175
|
error_msg = f"Task execution timed out after {timeout}s."
|
|
1179
1176
|
if self.mcp_server:
|
|
@@ -1204,20 +1201,13 @@ class ClaudeCodeHarness:
|
|
|
1204
1201
|
except Exception:
|
|
1205
1202
|
if mcp_server_name:
|
|
1206
1203
|
try:
|
|
1207
|
-
# Use shlex.quote() for MCP removal command
|
|
1208
|
-
quoted_env_file = shlex.quote(env_file)
|
|
1209
|
-
quoted_server_name = shlex.quote(mcp_server_name)
|
|
1210
|
-
remove_cmd = (
|
|
1211
|
-
f"source {quoted_env_file} && claude mcp remove {quoted_server_name}"
|
|
1212
|
-
)
|
|
1213
1204
|
await env.exec_command(
|
|
1214
|
-
f"
|
|
1215
|
-
timeout=
|
|
1216
|
-
environment=docker_env,
|
|
1205
|
+
f"rm -f {env.workdir}/.mcp.json",
|
|
1206
|
+
timeout=5,
|
|
1217
1207
|
)
|
|
1218
1208
|
except Exception as e:
|
|
1219
1209
|
if verbose:
|
|
1220
|
-
self._console.print(f"[dim red]Failed to
|
|
1210
|
+
self._console.print(f"[dim red]Failed to clean up .mcp.json: {e}[/dim red]")
|
|
1221
1211
|
raise
|
|
1222
1212
|
finally:
|
|
1223
1213
|
# Close MCP log file if it was opened
|
|
@@ -1230,7 +1220,9 @@ class ClaudeCodeHarness:
|
|
|
1230
1220
|
if verbose:
|
|
1231
1221
|
self._console.print(f"[dim red]Failed to close MCP log file: {e}[/dim red]")
|
|
1232
1222
|
|
|
1233
|
-
await env.exec_command(
|
|
1223
|
+
await env.exec_command(
|
|
1224
|
+
f"rm -f {prompt_file} {env_file} {env.workdir}/.mcp.json", timeout=5
|
|
1225
|
+
)
|
|
1234
1226
|
|
|
1235
1227
|
|
|
1236
1228
|
HARNESS_REGISTRY: dict[str, type] = {
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Test command specs from upstream SWE-bench harness.
|
|
2
|
+
|
|
3
|
+
Maps repositories to their correct test commands. mcpbr defaults to pytest
|
|
4
|
+
for all non-Django projects, but some projects (e.g., sympy) use custom test
|
|
5
|
+
runners that aren't pytest-compatible.
|
|
6
|
+
|
|
7
|
+
Source: https://github.com/SWE-bench/SWE-bench/blob/main/swebench/harness/constants/python.py
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
# Base test commands per framework (from upstream constants/python.py)
|
|
11
|
+
TEST_PYTEST = "pytest --no-header -rA --tb=no -p no:cacheprovider"
|
|
12
|
+
TEST_DJANGO = "./tests/runtests.py --verbosity 2 --settings=test_sqlite --parallel 1"
|
|
13
|
+
TEST_SYMPY = "PYTHONWARNINGS='ignore::UserWarning,ignore::SyntaxWarning' bin/test -C --verbose"
|
|
14
|
+
TEST_SPHINX = "tox --current-env -epy39 -v --"
|
|
15
|
+
TEST_ASTROPY = "pytest -rA -vv -o console_output_style=classic --tb=no"
|
|
16
|
+
TEST_SEABORN = "pytest --no-header -rA"
|
|
17
|
+
|
|
18
|
+
# Repo → test command mapping
|
|
19
|
+
# Only non-pytest entries need to be here — pytest is the default fallback.
|
|
20
|
+
# Django is included for documentation but its existing handler takes precedence.
|
|
21
|
+
REPO_TO_TEST_CMD: dict[str, str] = {
|
|
22
|
+
"sympy/sympy": TEST_SYMPY,
|
|
23
|
+
"django/django": TEST_DJANGO,
|
|
24
|
+
"sphinx-doc/sphinx": TEST_SPHINX,
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_repo_test_command(repo: str) -> str | None:
|
|
29
|
+
"""Look up the upstream test command for a repo.
|
|
30
|
+
|
|
31
|
+
Returns None if repo uses standard pytest (handled by existing logic).
|
|
32
|
+
"""
|
|
33
|
+
return REPO_TO_TEST_CMD.get(repo)
|