mcpbr 0.4.13__tar.gz → 0.4.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/marketplace.json +2 -2
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/package.json +1 -1
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/plugin.json +1 -1
- {mcpbr-0.4.13 → mcpbr-0.4.14}/CHANGELOG.md +2 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/PKG-INFO +1 -1
- {mcpbr-0.4.13 → mcpbr-0.4.14}/package.json +1 -1
- {mcpbr-0.4.13 → mcpbr-0.4.14}/pyproject.toml +1 -1
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/config.py +22 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/docker_env.py +15 -4
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/evaluation.py +19 -2
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/harness.py +4 -1
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/harnesses.py +29 -0
- mcpbr-0.4.14/src/mcpbr/swebench_test_specs.py +33 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/test_azure.py +82 -36
- mcpbr-0.4.13/tests/test_django_runner.py → mcpbr-0.4.14/tests/test_build_test_command.py +73 -1
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_trial_mode.py +6 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/uv.lock +1 -1
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude/settings.json +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/README.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/skills/README.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/skills/benchmark-swe-lite/SKILL.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/skills/mcpbr-config/SKILL.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/skills/mcpbr-eval/SKILL.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/dependabot.yml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/release-drafter.yml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/workflows/ci.yml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/workflows/post-release-bump.yml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/workflows/publish-npm.yml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/workflows/publish.yml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/workflows/release-drafter.yml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.gitignore +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/.pre-commit-config.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/AGENTS.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/CLAUDE.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/CODE_OF_CONDUCT.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/CONTRIBUTING.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/Dockerfile +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/HUMANEVAL_FIX_SUMMARY.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/LICENSE +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/Makefile +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/PR_SUMMARY.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/README.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/SECURITY.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/assets/mcpbr-demo.gif +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/assets/mcpbr-eval-results.png +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/assets/mcpbr-logo.jpg +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/bin/mcpbr.js +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/config/example.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/config/humaneval.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/config/supermodel.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/azure-config-example.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/env-vars-example.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/inheritance/README.md +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/inheritance/base-config.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/inheritance/dev-config.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/inheritance/multi-extend-config.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/inheritance/production-config.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/inheritance/shared-mcp-settings.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/local-config-example.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/quick-start/gsm8k-math-reasoning.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/quick-start/test-your-mcp-server.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/install.sh +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/requirements.txt +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/scripts/sync_version.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/scripts/validate_plugin_manifests.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/__init__.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/__main__.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/agent.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/__init__.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/agentbench.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/aider_polyglot.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/apps.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/arc.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/base.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/bigbench_hard.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/bigcodebench.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/codecontests.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/codereval.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/cybergym.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/gaia.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/gsm8k.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/hellaswag.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/humaneval.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/intercode.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/leetcode.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/math_benchmark.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/mbpp.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/mcptoolbench.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/mlagentbench.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/repoqa.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/swebench.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/terminalbench.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/toolbench.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/truthfulqa.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/webarena.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/cache.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/cli.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/config_inheritance.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/config_validator.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/env_expansion.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/incremental_save.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/infrastructure/__init__.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/infrastructure/azure.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/infrastructure/azure_health.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/infrastructure/base.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/infrastructure/local.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/infrastructure/manager.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/junit_reporter.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/log_formatter.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/models.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/output_validator.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/preflight.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/pricing.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/profiler.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/providers.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/regression.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/reporting.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/schema.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/smoke_test.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/state_tracker.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/statistics.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/streaming.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/templates.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/__init__.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/__init__.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/test_azure_health.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/test_base.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/test_cli_infrastructure.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/test_config.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/test_local.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/test_manager.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_agent.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_benchmark_filtering.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_benchmark_integration.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_benchmarks.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_cache.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_claude_plugin.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_cli_templates.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_comparison_aggregation.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_comparison_config.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_comparison_integration.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_comparison_reporting.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_config.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_config_env_vars.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_config_inheritance.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_config_validator.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_config_validator_inheritance.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_cost_calculation.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_default_logging.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_docker_cleanup.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_docker_label_fix.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_docker_retry.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_env_expansion.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_error_messages.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_evaluation.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_exit_codes.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_export.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_git_diff_new_files.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_incremental_save.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_integration.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_junit_reporter.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_log_formatter_read_tool.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_mcp_health_check.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_mcp_logging.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_models.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_output_validator.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_parse_errors.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_preflight.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_pricing.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_profiler.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_regression.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_reporting.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_runtime_tracking.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_schema.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_smoke_test.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_state_tracker.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_statistics.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_statistics_integration.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_streaming.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_string_concat_bug.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_templates.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_thinking_budget.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_timeout_tracking.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_tool_failure_tracking.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_type_safety.py +0 -0
- {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_xml_export.py +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"$schema": "https://anthropic.com/claude-code/marketplace.schema.json",
|
|
3
3
|
"name": "mcpbr",
|
|
4
|
-
"version": "0.4.13",
|
|
4
|
+
"version": "0.4.14",
|
|
5
5
|
"description": "mcpbr - MCP Benchmark Runner plugin marketplace",
|
|
6
6
|
"owner": {
|
|
7
7
|
"name": "mcpbr Contributors",
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
{
|
|
12
12
|
"name": "mcpbr",
|
|
13
13
|
"description": "Expert benchmark runner for MCP servers using mcpbr. Handles Docker checks, config generation, and result parsing.",
|
|
14
|
-
"version": "0.4.13",
|
|
14
|
+
"version": "0.4.14",
|
|
15
15
|
"author": {
|
|
16
16
|
"name": "mcpbr Contributors"
|
|
17
17
|
},
|
|
@@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
13
13
|
|
|
14
14
|
### Fixed
|
|
15
15
|
|
|
16
|
+
- **Repository-aware test commands for non-pytest projects** (#365): Use upstream SWE-bench test command specs for sympy (`bin/test`), sphinx (`tox`), and other non-pytest repos instead of defaulting to `python -m pytest`
|
|
17
|
+
- **Flaky Azure and trial mode tests**: Fixed tests that depended on local `~/.ssh/mcpbr_azure` state and updated assertions for multi-step dependency installation
|
|
16
18
|
- **SEO improvements** for documentation site
|
|
17
19
|
- Added robots.txt with sitemap reference
|
|
18
20
|
- Added Open Graph and Twitter Card meta tags on all pages
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mcpbr
|
|
3
|
-
Version: 0.4.13
|
|
3
|
+
Version: 0.4.14
|
|
4
4
|
Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
|
|
5
5
|
Project-URL: Homepage, https://github.com/greynewell/mcpbr
|
|
6
6
|
Project-URL: Repository, https://github.com/greynewell/mcpbr
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "mcpbr"
|
|
7
|
-
version = "0.4.13"
|
|
7
|
+
version = "0.4.14"
|
|
8
8
|
description = "Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -109,6 +109,16 @@ class MCPServerConfig(BaseModel):
|
|
|
109
109
|
default=900000,
|
|
110
110
|
description="Timeout in milliseconds for MCP tool execution (default: 15 min for long-running tools)",
|
|
111
111
|
)
|
|
112
|
+
setup_command: str | None = Field(
|
|
113
|
+
default=None,
|
|
114
|
+
description="Shell command to run inside the container BEFORE the agent starts. "
|
|
115
|
+
"Runs outside the task timer (does not count against timeout_seconds). "
|
|
116
|
+
"Use {workdir} as placeholder. Useful for pre-computing caches.",
|
|
117
|
+
)
|
|
118
|
+
setup_timeout_ms: int = Field(
|
|
119
|
+
default=900000,
|
|
120
|
+
description="Timeout in milliseconds for the setup_command (default: 15 min)",
|
|
121
|
+
)
|
|
112
122
|
|
|
113
123
|
def get_args_for_workdir(self, workdir: str) -> list[str]:
|
|
114
124
|
"""Replace {workdir} placeholder in args with actual path."""
|
|
@@ -117,6 +127,12 @@ class MCPServerConfig(BaseModel):
|
|
|
117
127
|
result.append(arg.replace("{workdir}", workdir))
|
|
118
128
|
return result
|
|
119
129
|
|
|
130
|
+
def get_setup_command_for_workdir(self, workdir: str) -> str | None:
|
|
131
|
+
"""Replace {workdir} placeholder in setup_command with actual path."""
|
|
132
|
+
if self.setup_command is None:
|
|
133
|
+
return None
|
|
134
|
+
return self.setup_command.replace("{workdir}", workdir)
|
|
135
|
+
|
|
120
136
|
def get_expanded_env(self) -> dict[str, str]:
|
|
121
137
|
"""Expand ${VAR} references in env values using os.environ.
|
|
122
138
|
|
|
@@ -400,6 +416,12 @@ class HarnessConfig(BaseModel):
|
|
|
400
416
|
description="Enable comprehensive performance profiling (tool latency, memory, overhead)",
|
|
401
417
|
)
|
|
402
418
|
|
|
419
|
+
volumes: dict[str, str] = Field(
|
|
420
|
+
default_factory=dict,
|
|
421
|
+
description="Additional volume mounts (read-write) for Docker containers (host_path: container_path). "
|
|
422
|
+
"Mounted into every container, persists across tasks. Useful for pre-computed caches.",
|
|
423
|
+
)
|
|
424
|
+
|
|
403
425
|
infrastructure: InfrastructureConfig = Field(
|
|
404
426
|
default_factory=InfrastructureConfig,
|
|
405
427
|
description="Infrastructure configuration (local or azure)",
|
|
@@ -314,14 +314,18 @@ class DockerEnvironmentManager:
|
|
|
314
314
|
FALLBACK_IMAGE = "mcpbr-env"
|
|
315
315
|
DOCKERFILE_PATH = Path(__file__).parent.parent.parent / "Dockerfile"
|
|
316
316
|
|
|
317
|
-
def __init__(self, use_prebuilt: bool = True) -> None:
|
|
317
|
+
def __init__(
|
|
318
|
+
self, use_prebuilt: bool = True, extra_volumes: dict[str, str] | None = None
|
|
319
|
+
) -> None:
|
|
318
320
|
"""Initialize the Docker environment manager.
|
|
319
321
|
|
|
320
322
|
Args:
|
|
321
323
|
use_prebuilt: If True, try to use pre-built SWE-bench images first.
|
|
324
|
+
extra_volumes: Additional volume mounts (read-write) (host_path -> container_path).
|
|
322
325
|
"""
|
|
323
326
|
self.client = docker.from_env()
|
|
324
327
|
self.use_prebuilt = use_prebuilt
|
|
328
|
+
self._extra_volumes = extra_volumes or {}
|
|
325
329
|
self._fallback_image_built = False
|
|
326
330
|
self._temp_dirs: list[tempfile.TemporaryDirectory[str]] = []
|
|
327
331
|
self._containers: list[Container] = []
|
|
@@ -488,6 +492,15 @@ CMD ["/bin/bash"]
|
|
|
488
492
|
|
|
489
493
|
for attempt in range(max_retries + 1):
|
|
490
494
|
try:
|
|
495
|
+
volumes_dict: dict[str, dict[str, str]] = {
|
|
496
|
+
host_workdir: {"bind": "/workspace", "mode": "rw"},
|
|
497
|
+
}
|
|
498
|
+
for host_path, container_path in self._extra_volumes.items():
|
|
499
|
+
volumes_dict[os.path.abspath(host_path)] = {
|
|
500
|
+
"bind": container_path,
|
|
501
|
+
"mode": "rw",
|
|
502
|
+
}
|
|
503
|
+
|
|
491
504
|
container = self.client.containers.run(
|
|
492
505
|
image_name,
|
|
493
506
|
command="tail -f /dev/null",
|
|
@@ -495,9 +508,7 @@ CMD ["/bin/bash"]
|
|
|
495
508
|
detach=True,
|
|
496
509
|
platform="linux/amd64" if uses_prebuilt else None,
|
|
497
510
|
network_mode="bridge", # Enable network for API calls
|
|
498
|
-
volumes={
|
|
499
|
-
host_workdir: {"bind": "/workspace", "mode": "rw"},
|
|
500
|
-
},
|
|
511
|
+
volumes=volumes_dict,
|
|
501
512
|
working_dir=container_workdir,
|
|
502
513
|
remove=False,
|
|
503
514
|
labels={
|
|
@@ -137,6 +137,7 @@ async def run_tests(
|
|
|
137
137
|
timeout: int = 120,
|
|
138
138
|
uses_prebuilt: bool = False,
|
|
139
139
|
workdir: str | None = None,
|
|
140
|
+
repo: str | None = None,
|
|
140
141
|
) -> TestResults:
|
|
141
142
|
"""Run a list of tests and return results.
|
|
142
143
|
|
|
@@ -146,6 +147,7 @@ async def run_tests(
|
|
|
146
147
|
timeout: Timeout per test in seconds.
|
|
147
148
|
uses_prebuilt: Whether a pre-built SWE-bench image is being used.
|
|
148
149
|
workdir: Working directory to run tests from. Defaults to env.workdir.
|
|
150
|
+
repo: Repository identifier for looking up the correct test runner.
|
|
149
151
|
|
|
150
152
|
Returns:
|
|
151
153
|
TestResults with pass/fail counts.
|
|
@@ -157,7 +159,7 @@ async def run_tests(
|
|
|
157
159
|
passed = 0
|
|
158
160
|
|
|
159
161
|
for test in tests:
|
|
160
|
-
test_cmd = _build_test_command(test, uses_prebuilt)
|
|
162
|
+
test_cmd = _build_test_command(test, uses_prebuilt, repo=repo)
|
|
161
163
|
|
|
162
164
|
try:
|
|
163
165
|
exit_code, stdout, stderr = await env.exec_command(
|
|
@@ -198,7 +200,7 @@ async def run_tests(
|
|
|
198
200
|
)
|
|
199
201
|
|
|
200
202
|
|
|
201
|
-
def _build_test_command(test: str, uses_prebuilt: bool = False) -> str:
|
|
203
|
+
def _build_test_command(test: str, uses_prebuilt: bool = False, repo: str | None = None) -> str:
|
|
202
204
|
"""Build a test command for the given test identifier.
|
|
203
205
|
|
|
204
206
|
Args:
|
|
@@ -206,18 +208,29 @@ def _build_test_command(test: str, uses_prebuilt: bool = False) -> str:
|
|
|
206
208
|
- pytest: "tests/test_file.py::test_func" or "tests/test_file.py"
|
|
207
209
|
- Django: "test_method (module.TestClass)" or "module.tests.TestClass.test_method"
|
|
208
210
|
uses_prebuilt: If True, activate the testbed conda environment first.
|
|
211
|
+
repo: Repository identifier (e.g., "sympy/sympy") for looking up
|
|
212
|
+
the correct test runner from upstream SWE-bench specs.
|
|
209
213
|
|
|
210
214
|
Returns:
|
|
211
215
|
Shell command string to run the test.
|
|
212
216
|
"""
|
|
213
217
|
import re
|
|
214
218
|
|
|
219
|
+
from .swebench_test_specs import get_repo_test_command
|
|
220
|
+
|
|
215
221
|
# Pre-built SWE-bench images use a conda environment called 'testbed'
|
|
216
222
|
if uses_prebuilt:
|
|
217
223
|
activate = "source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed && "
|
|
218
224
|
else:
|
|
219
225
|
activate = ""
|
|
220
226
|
|
|
227
|
+
# Check upstream SWE-bench test command mapping for non-pytest runners
|
|
228
|
+
if repo:
|
|
229
|
+
upstream_cmd = get_repo_test_command(repo)
|
|
230
|
+
if upstream_cmd and "runtests.py" not in upstream_cmd and "pytest" not in upstream_cmd:
|
|
231
|
+
# Non-pytest, non-Django project (e.g., sympy uses bin/test)
|
|
232
|
+
return f"{activate}{upstream_cmd} {test}"
|
|
233
|
+
|
|
221
234
|
# Detect Django test format: "test_method (module.TestClass)"
|
|
222
235
|
if "(" in test and ")" in test and "." in test:
|
|
223
236
|
# Extract module path from parentheses
|
|
@@ -344,12 +357,15 @@ async def evaluate_patch(
|
|
|
344
357
|
if not env.uses_prebuilt:
|
|
345
358
|
await _install_dependencies(env)
|
|
346
359
|
|
|
360
|
+
repo = task.get("repo")
|
|
361
|
+
|
|
347
362
|
fail_to_pass_results = await run_tests(
|
|
348
363
|
env,
|
|
349
364
|
fail_to_pass_tests,
|
|
350
365
|
timeout=test_timeout,
|
|
351
366
|
uses_prebuilt=env.uses_prebuilt,
|
|
352
367
|
workdir=eval_workdir,
|
|
368
|
+
repo=repo,
|
|
353
369
|
)
|
|
354
370
|
|
|
355
371
|
pass_to_pass_results = await run_tests(
|
|
@@ -358,6 +374,7 @@ async def evaluate_patch(
|
|
|
358
374
|
timeout=test_timeout,
|
|
359
375
|
uses_prebuilt=env.uses_prebuilt,
|
|
360
376
|
workdir=eval_workdir,
|
|
377
|
+
repo=repo,
|
|
361
378
|
)
|
|
362
379
|
|
|
363
380
|
resolved = (
|
|
@@ -962,7 +962,10 @@ async def run_evaluation(
|
|
|
962
962
|
"args": config.mcp_server.args if config.mcp_server else [],
|
|
963
963
|
}
|
|
964
964
|
|
|
965
|
-
docker_manager = DockerEnvironmentManager(use_prebuilt=config.use_prebuilt_images)
|
|
965
|
+
docker_manager = DockerEnvironmentManager(
|
|
966
|
+
use_prebuilt=config.use_prebuilt_images,
|
|
967
|
+
extra_volumes=config.volumes,
|
|
968
|
+
)
|
|
966
969
|
|
|
967
970
|
results: list[TaskResult] = []
|
|
968
971
|
# Add cached results if using state tracker
|
|
@@ -895,6 +895,35 @@ class ClaudeCodeHarness:
|
|
|
895
895
|
cost_usd=None,
|
|
896
896
|
)
|
|
897
897
|
|
|
898
|
+
# Run setup_command if configured (BEFORE agent, OUTSIDE task timer).
|
|
899
|
+
# This is the right place for expensive one-time operations like
|
|
900
|
+
# pre-computing caches that should not count against timeout_seconds.
|
|
901
|
+
if self.mcp_server and self.mcp_server.setup_command:
|
|
902
|
+
setup_cmd = self.mcp_server.get_setup_command_for_workdir(env.workdir)
|
|
903
|
+
setup_timeout = int(self.mcp_server.setup_timeout_ms / 1000)
|
|
904
|
+
|
|
905
|
+
if verbose:
|
|
906
|
+
self._console.print(
|
|
907
|
+
f"[cyan]Running setup command (timeout: {setup_timeout:.0f}s)...[/cyan]"
|
|
908
|
+
)
|
|
909
|
+
|
|
910
|
+
setup_full_cmd = f"source {shlex.quote(env_file)} && {setup_cmd}"
|
|
911
|
+
setup_exit, _setup_stdout, setup_stderr = await env.exec_command(
|
|
912
|
+
["/bin/bash", "-c", setup_full_cmd],
|
|
913
|
+
timeout=setup_timeout,
|
|
914
|
+
)
|
|
915
|
+
|
|
916
|
+
if setup_exit != 0:
|
|
917
|
+
if verbose:
|
|
918
|
+
self._console.print(
|
|
919
|
+
f"[yellow]⚠ Setup command exited with code {setup_exit}[/yellow]"
|
|
920
|
+
)
|
|
921
|
+
if setup_stderr:
|
|
922
|
+
self._console.print(f"[dim]{setup_stderr[:500]}[/dim]")
|
|
923
|
+
# Non-fatal: continue with agent even if setup fails
|
|
924
|
+
elif verbose:
|
|
925
|
+
self._console.print("[green]✓ Setup command completed[/green]")
|
|
926
|
+
|
|
898
927
|
try:
|
|
899
928
|
claude_args = [
|
|
900
929
|
"--print",
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Test command specs from upstream SWE-bench harness.
|
|
2
|
+
|
|
3
|
+
Maps repositories to their correct test commands. mcpbr defaults to pytest
|
|
4
|
+
for all non-Django projects, but some projects (e.g., sympy) use custom test
|
|
5
|
+
runners that aren't pytest-compatible.
|
|
6
|
+
|
|
7
|
+
Source: https://github.com/SWE-bench/SWE-bench/blob/main/swebench/harness/constants/python.py
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
# Base test commands per framework (from upstream constants/python.py)
|
|
11
|
+
TEST_PYTEST = "pytest --no-header -rA --tb=no -p no:cacheprovider"
|
|
12
|
+
TEST_DJANGO = "./tests/runtests.py --verbosity 2 --settings=test_sqlite --parallel 1"
|
|
13
|
+
TEST_SYMPY = "PYTHONWARNINGS='ignore::UserWarning,ignore::SyntaxWarning' bin/test -C --verbose"
|
|
14
|
+
TEST_SPHINX = "tox --current-env -epy39 -v --"
|
|
15
|
+
TEST_ASTROPY = "pytest -rA -vv -o console_output_style=classic --tb=no"
|
|
16
|
+
TEST_SEABORN = "pytest --no-header -rA"
|
|
17
|
+
|
|
18
|
+
# Repo → test command mapping
|
|
19
|
+
# Only non-pytest entries need to be here — pytest is the default fallback.
|
|
20
|
+
# Django is included for documentation but its existing handler takes precedence.
|
|
21
|
+
REPO_TO_TEST_CMD: dict[str, str] = {
|
|
22
|
+
"sympy/sympy": TEST_SYMPY,
|
|
23
|
+
"django/django": TEST_DJANGO,
|
|
24
|
+
"sphinx-doc/sphinx": TEST_SPHINX,
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_repo_test_command(repo: str) -> str | None:
|
|
29
|
+
"""Look up the upstream test command for a repo.
|
|
30
|
+
|
|
31
|
+
Returns None if repo uses standard pytest (handled by existing logic).
|
|
32
|
+
"""
|
|
33
|
+
return REPO_TO_TEST_CMD.get(repo)
|
|
@@ -155,11 +155,15 @@ class TestVMProvisioning:
|
|
|
155
155
|
mock_time: MagicMock,
|
|
156
156
|
mock_run: MagicMock,
|
|
157
157
|
azure_provider: AzureProvider,
|
|
158
|
+
tmp_path: Path,
|
|
158
159
|
) -> None:
|
|
159
160
|
"""Test successful VM creation."""
|
|
160
|
-
#
|
|
161
|
+
# Use existing SSH key to avoid depending on ~/.ssh/mcpbr_azure state
|
|
162
|
+
ssh_key = tmp_path / "test_key"
|
|
163
|
+
ssh_key.touch()
|
|
164
|
+
azure_provider.azure_config.ssh_key_path = ssh_key
|
|
165
|
+
|
|
161
166
|
mock_run.side_effect = [
|
|
162
|
-
Mock(returncode=0), # ssh-keygen
|
|
163
167
|
Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show (exists)
|
|
164
168
|
Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
|
|
165
169
|
]
|
|
@@ -179,11 +183,15 @@ class TestVMProvisioning:
|
|
|
179
183
|
mock_time: MagicMock,
|
|
180
184
|
mock_run: MagicMock,
|
|
181
185
|
azure_provider: AzureProvider,
|
|
186
|
+
tmp_path: Path,
|
|
182
187
|
) -> None:
|
|
183
188
|
"""Test VM creation with resource group creation."""
|
|
184
|
-
#
|
|
189
|
+
# Use existing SSH key to avoid depending on ~/.ssh/mcpbr_azure state
|
|
190
|
+
ssh_key = tmp_path / "test_key"
|
|
191
|
+
ssh_key.touch()
|
|
192
|
+
azure_provider.azure_config.ssh_key_path = ssh_key
|
|
193
|
+
|
|
185
194
|
mock_run.side_effect = [
|
|
186
|
-
Mock(returncode=0), # ssh-keygen
|
|
187
195
|
Mock(returncode=1, stderr="ResourceGroupNotFound"), # az group show (not found)
|
|
188
196
|
Mock(returncode=0), # az group create
|
|
189
197
|
Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
|
|
@@ -198,16 +206,19 @@ class TestVMProvisioning:
|
|
|
198
206
|
self,
|
|
199
207
|
mock_run: MagicMock,
|
|
200
208
|
azure_provider: AzureProvider,
|
|
209
|
+
tmp_path: Path,
|
|
201
210
|
) -> None:
|
|
202
211
|
"""Test VM creation with SSH key generation."""
|
|
203
|
-
#
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
212
|
+
# Redirect Path.home to tmp_path so ~/.ssh/mcpbr_azure doesn't exist
|
|
213
|
+
with patch("mcpbr.infrastructure.azure.Path.home", return_value=tmp_path):
|
|
214
|
+
# Mock ssh-keygen, resource group show, and vm creation
|
|
215
|
+
mock_run.side_effect = [
|
|
216
|
+
Mock(returncode=0), # ssh-keygen
|
|
217
|
+
Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
|
|
218
|
+
Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
|
|
219
|
+
]
|
|
209
220
|
|
|
210
|
-
|
|
221
|
+
await azure_provider._create_vm("Standard_D8s_v3")
|
|
211
222
|
|
|
212
223
|
# Verify ssh-keygen was called
|
|
213
224
|
ssh_keygen_call = mock_run.call_args_list[0]
|
|
@@ -218,11 +229,15 @@ class TestVMProvisioning:
|
|
|
218
229
|
self,
|
|
219
230
|
mock_run: MagicMock,
|
|
220
231
|
azure_provider: AzureProvider,
|
|
232
|
+
tmp_path: Path,
|
|
221
233
|
) -> None:
|
|
222
234
|
"""Test VM creation failure (quota exceeded)."""
|
|
223
|
-
#
|
|
235
|
+
# Use existing SSH key to avoid depending on ~/.ssh/mcpbr_azure state
|
|
236
|
+
ssh_key = tmp_path / "test_key"
|
|
237
|
+
ssh_key.touch()
|
|
238
|
+
azure_provider.azure_config.ssh_key_path = ssh_key
|
|
239
|
+
|
|
224
240
|
mock_run.side_effect = [
|
|
225
|
-
Mock(returncode=0), # ssh-keygen
|
|
226
241
|
Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
|
|
227
242
|
Mock(returncode=1, stderr="QuotaExceeded: Core quota exceeded"), # az vm create
|
|
228
243
|
]
|
|
@@ -577,13 +592,18 @@ class TestSetup:
|
|
|
577
592
|
mock_ssh_client: MagicMock,
|
|
578
593
|
mock_run: MagicMock,
|
|
579
594
|
azure_provider: AzureProvider,
|
|
595
|
+
tmp_path: Path,
|
|
580
596
|
) -> None:
|
|
581
597
|
"""Test full setup flow (create VM, wait SSH, get IP, install, config, test)."""
|
|
582
598
|
mock_env_get.return_value = "test-api-key"
|
|
583
599
|
|
|
584
|
-
#
|
|
600
|
+
# Use existing SSH key to avoid depending on ~/.ssh/mcpbr_azure state
|
|
601
|
+
ssh_key = tmp_path / "test_key"
|
|
602
|
+
ssh_key.touch()
|
|
603
|
+
azure_provider.azure_config.ssh_key_path = ssh_key
|
|
604
|
+
|
|
605
|
+
# Mock resource group show, vm create, vm show (no ssh-keygen needed)
|
|
585
606
|
mock_run.side_effect = [
|
|
586
|
-
Mock(returncode=0), # ssh-keygen
|
|
587
607
|
Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
|
|
588
608
|
Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
|
|
589
609
|
Mock(returncode=0, stdout='"1.2.3.4"'), # az vm show (note: quoted string in JSON)
|
|
@@ -618,11 +638,16 @@ class TestSetup:
|
|
|
618
638
|
mock_time: MagicMock,
|
|
619
639
|
mock_run: MagicMock,
|
|
620
640
|
azure_provider: AzureProvider,
|
|
641
|
+
tmp_path: Path,
|
|
621
642
|
) -> None:
|
|
622
643
|
"""Test setup failure rolls back VM creation."""
|
|
623
|
-
#
|
|
644
|
+
# Use existing SSH key to avoid depending on ~/.ssh/mcpbr_azure state
|
|
645
|
+
ssh_key = tmp_path / "test_key"
|
|
646
|
+
ssh_key.touch()
|
|
647
|
+
azure_provider.azure_config.ssh_key_path = ssh_key
|
|
648
|
+
|
|
649
|
+
# Mock resource group show, VM creation success, IP retrieval failure
|
|
624
650
|
mock_run.side_effect = [
|
|
625
|
-
Mock(returncode=0), # ssh-keygen
|
|
626
651
|
Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
|
|
627
652
|
Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
|
|
628
653
|
Mock(returncode=1, stderr="VM not found"), # az vm show (failure)
|
|
@@ -687,11 +712,12 @@ class TestSetup:
|
|
|
687
712
|
mock_ssh_client: MagicMock,
|
|
688
713
|
mock_run: MagicMock,
|
|
689
714
|
azure_provider: AzureProvider,
|
|
715
|
+
tmp_path: Path,
|
|
690
716
|
) -> None:
|
|
691
717
|
"""Test setup with generated SSH key."""
|
|
692
718
|
mock_env_get.return_value = "test-api-key"
|
|
693
719
|
|
|
694
|
-
# No SSH key configured
|
|
720
|
+
# No SSH key configured - redirect home to tmp_path so key doesn't exist
|
|
695
721
|
azure_provider.azure_config.ssh_key_path = None
|
|
696
722
|
|
|
697
723
|
mock_run.side_effect = [
|
|
@@ -717,7 +743,8 @@ class TestSetup:
|
|
|
717
743
|
mock_sftp = MagicMock()
|
|
718
744
|
mock_client.open_sftp.return_value = mock_sftp
|
|
719
745
|
|
|
720
|
-
|
|
746
|
+
with patch("mcpbr.infrastructure.azure.Path.home", return_value=tmp_path):
|
|
747
|
+
await azure_provider.setup()
|
|
721
748
|
|
|
722
749
|
# Verify ssh-keygen was called
|
|
723
750
|
ssh_keygen_call = mock_run.call_args_list[0]
|
|
@@ -793,12 +820,13 @@ class TestEnvironmentSetup:
|
|
|
793
820
|
|
|
794
821
|
await azure_provider._install_dependencies()
|
|
795
822
|
|
|
796
|
-
# Verify
|
|
797
|
-
mock_client.exec_command.
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
assert "
|
|
801
|
-
assert "
|
|
823
|
+
# Verify all 4 steps were executed (Docker, Python, Node.js, mcpbr)
|
|
824
|
+
assert mock_client.exec_command.call_count == 4
|
|
825
|
+
all_cmds = [call[0][0] for call in mock_client.exec_command.call_args_list]
|
|
826
|
+
all_cmds_str = " ".join(all_cmds)
|
|
827
|
+
assert "apt-get update" in all_cmds_str
|
|
828
|
+
assert "docker" in all_cmds_str.lower()
|
|
829
|
+
assert "pip install mcpbr" in all_cmds_str
|
|
802
830
|
|
|
803
831
|
async def test_install_dependencies_handles_failures_gracefully(
|
|
804
832
|
self,
|
|
@@ -819,7 +847,8 @@ class TestEnvironmentSetup:
|
|
|
819
847
|
# Should not raise - just log warning
|
|
820
848
|
await azure_provider._install_dependencies()
|
|
821
849
|
|
|
822
|
-
|
|
850
|
+
# All 4 steps still execute even if individual steps fail
|
|
851
|
+
assert mock_client.exec_command.call_count == 4
|
|
823
852
|
|
|
824
853
|
async def test_install_dependencies_installs_docker(
|
|
825
854
|
self,
|
|
@@ -839,8 +868,9 @@ class TestEnvironmentSetup:
|
|
|
839
868
|
|
|
840
869
|
await azure_provider._install_dependencies()
|
|
841
870
|
|
|
842
|
-
|
|
843
|
-
|
|
871
|
+
# Docker install is the first step
|
|
872
|
+
all_cmds = [call[0][0] for call in mock_client.exec_command.call_args_list]
|
|
873
|
+
assert any("get.docker.com" in cmd for cmd in all_cmds)
|
|
844
874
|
|
|
845
875
|
async def test_install_dependencies_installs_python_version(
|
|
846
876
|
self,
|
|
@@ -879,8 +909,9 @@ class TestEnvironmentSetup:
|
|
|
879
909
|
|
|
880
910
|
await azure_provider._install_dependencies()
|
|
881
911
|
|
|
882
|
-
|
|
883
|
-
|
|
912
|
+
# mcpbr install is the last step
|
|
913
|
+
all_cmds = [call[0][0] for call in mock_client.exec_command.call_args_list]
|
|
914
|
+
assert any("pip install mcpbr" in cmd for cmd in all_cmds)
|
|
884
915
|
|
|
885
916
|
|
|
886
917
|
# ============================================================================
|
|
@@ -1209,13 +1240,16 @@ class TestUpdatedSetup:
|
|
|
1209
1240
|
mock_ssh_client: MagicMock,
|
|
1210
1241
|
mock_run: MagicMock,
|
|
1211
1242
|
azure_provider: AzureProvider,
|
|
1243
|
+
tmp_path: Path,
|
|
1212
1244
|
) -> None:
|
|
1213
1245
|
"""Test full setup flow includes dependency installation."""
|
|
1214
1246
|
mock_env_get.return_value = "test-api-key"
|
|
1247
|
+
ssh_key = tmp_path / "test_key"
|
|
1248
|
+
ssh_key.touch()
|
|
1249
|
+
azure_provider.azure_config.ssh_key_path = ssh_key
|
|
1215
1250
|
|
|
1216
|
-
# Mock subprocess calls
|
|
1251
|
+
# Mock subprocess calls (no ssh-keygen needed with existing key)
|
|
1217
1252
|
mock_run.side_effect = [
|
|
1218
|
-
Mock(returncode=0), # ssh-keygen
|
|
1219
1253
|
Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
|
|
1220
1254
|
Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
|
|
1221
1255
|
Mock(returncode=0, stdout='"1.2.3.4"'), # az vm show
|
|
@@ -1259,12 +1293,15 @@ class TestUpdatedSetup:
|
|
|
1259
1293
|
mock_ssh_client: MagicMock,
|
|
1260
1294
|
mock_run: MagicMock,
|
|
1261
1295
|
azure_provider: AzureProvider,
|
|
1296
|
+
tmp_path: Path,
|
|
1262
1297
|
) -> None:
|
|
1263
1298
|
"""Test full setup flow includes config transfer."""
|
|
1264
1299
|
mock_env_get.return_value = "test-api-key"
|
|
1300
|
+
ssh_key = tmp_path / "test_key"
|
|
1301
|
+
ssh_key.touch()
|
|
1302
|
+
azure_provider.azure_config.ssh_key_path = ssh_key
|
|
1265
1303
|
|
|
1266
1304
|
mock_run.side_effect = [
|
|
1267
|
-
Mock(returncode=0), # ssh-keygen
|
|
1268
1305
|
Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
|
|
1269
1306
|
Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
|
|
1270
1307
|
Mock(returncode=0, stdout='"1.2.3.4"'), # az vm show
|
|
@@ -1301,12 +1338,15 @@ class TestUpdatedSetup:
|
|
|
1301
1338
|
mock_ssh_client: MagicMock,
|
|
1302
1339
|
mock_run: MagicMock,
|
|
1303
1340
|
azure_provider: AzureProvider,
|
|
1341
|
+
tmp_path: Path,
|
|
1304
1342
|
) -> None:
|
|
1305
1343
|
"""Test full setup flow includes env var export."""
|
|
1306
1344
|
mock_env_get.return_value = "test-api-key"
|
|
1345
|
+
ssh_key = tmp_path / "test_key"
|
|
1346
|
+
ssh_key.touch()
|
|
1347
|
+
azure_provider.azure_config.ssh_key_path = ssh_key
|
|
1307
1348
|
|
|
1308
1349
|
mock_run.side_effect = [
|
|
1309
|
-
Mock(returncode=0), # ssh-keygen
|
|
1310
1350
|
Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
|
|
1311
1351
|
Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
|
|
1312
1352
|
Mock(returncode=0, stdout='"1.2.3.4"'), # az vm show
|
|
@@ -1343,12 +1383,15 @@ class TestUpdatedSetup:
|
|
|
1343
1383
|
mock_ssh_client: MagicMock,
|
|
1344
1384
|
mock_run: MagicMock,
|
|
1345
1385
|
azure_provider: AzureProvider,
|
|
1386
|
+
tmp_path: Path,
|
|
1346
1387
|
) -> None:
|
|
1347
1388
|
"""Test full setup flow includes test task."""
|
|
1348
1389
|
mock_env_get.return_value = "test-api-key"
|
|
1390
|
+
ssh_key = tmp_path / "test_key"
|
|
1391
|
+
ssh_key.touch()
|
|
1392
|
+
azure_provider.azure_config.ssh_key_path = ssh_key
|
|
1349
1393
|
|
|
1350
1394
|
mock_run.side_effect = [
|
|
1351
|
-
Mock(returncode=0), # ssh-keygen
|
|
1352
1395
|
Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
|
|
1353
1396
|
Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
|
|
1354
1397
|
Mock(returncode=0, stdout='"1.2.3.4"'), # az vm show
|
|
@@ -1385,12 +1428,15 @@ class TestUpdatedSetup:
|
|
|
1385
1428
|
mock_ssh_client: MagicMock,
|
|
1386
1429
|
mock_run: MagicMock,
|
|
1387
1430
|
azure_provider: AzureProvider,
|
|
1431
|
+
tmp_path: Path,
|
|
1388
1432
|
) -> None:
|
|
1389
1433
|
"""Test setup fails if test task fails."""
|
|
1390
1434
|
mock_env_get.return_value = "test-api-key"
|
|
1435
|
+
ssh_key = tmp_path / "test_key"
|
|
1436
|
+
ssh_key.touch()
|
|
1437
|
+
azure_provider.azure_config.ssh_key_path = ssh_key
|
|
1391
1438
|
|
|
1392
1439
|
mock_run.side_effect = [
|
|
1393
|
-
Mock(returncode=0), # ssh-keygen
|
|
1394
1440
|
Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
|
|
1395
1441
|
Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
|
|
1396
1442
|
Mock(returncode=0, stdout='"1.2.3.4"'), # az vm show
|