mcpbr 0.5.2__tar.gz → 0.5.4__tar.gz
This diff shows the content of publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/marketplace.json +2 -2
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/package.json +1 -1
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/plugin.json +1 -1
- {mcpbr-0.5.2 → mcpbr-0.5.4}/PKG-INFO +1 -1
- {mcpbr-0.5.2 → mcpbr-0.5.4}/package.json +1 -1
- {mcpbr-0.5.2 → mcpbr-0.5.4}/pyproject.toml +1 -1
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/__init__.py +1 -1
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/evaluation.py +75 -54
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/harness.py +145 -3
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/infrastructure/azure.py +4 -3
- mcpbr-0.5.4/tests/test_cold_start.py +130 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_mcp_logging.py +11 -9
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude/settings.json +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/README.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/skills/README.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/skills/benchmark-swe-lite/SKILL.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/skills/mcpbr-config/SKILL.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/skills/mcpbr-eval/SKILL.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/dependabot.yml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/release-drafter.yml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/workflows/ci.yml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/workflows/post-release-bump.yml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/workflows/publish-npm.yml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/workflows/publish.yml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/workflows/release-drafter.yml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.gitignore +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/.pre-commit-config.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/AGENTS.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/CHANGELOG.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/CLAUDE.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/CODE_OF_CONDUCT.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/CONTRIBUTING.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/Dockerfile +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/HUMANEVAL_FIX_SUMMARY.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/LICENSE +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/Makefile +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/PR_SUMMARY.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/README.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/SECURITY.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/assets/mcpbr-demo.gif +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/assets/mcpbr-eval-results.png +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/assets/mcpbr-logo.jpg +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/bin/mcpbr.js +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/config/example.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/config/humaneval.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/config/supermodel.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/azure-config-example.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/custom-benchmark.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/env-vars-example.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/inheritance/README.md +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/inheritance/base-config.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/inheritance/dev-config.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/inheritance/multi-extend-config.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/inheritance/production-config.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/inheritance/shared-mcp-settings.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/local-config-example.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/quick-start/gsm8k-math-reasoning.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/quick-start/test-your-mcp-server.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/install.sh +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/requirements.txt +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/scripts/sync_version.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/scripts/validate_plugin_manifests.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/__main__.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/agent.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/__init__.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/adversarial.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/agentbench.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/aider_polyglot.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/apps.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/arc.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/base.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/bigbench_hard.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/bigcodebench.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/codecontests.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/codereval.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/custom.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/cybergym.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/gaia.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/gsm8k.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/hellaswag.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/humaneval.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/intercode.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/leetcode.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/longbench.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/math_benchmark.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/mbpp.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/mcptoolbench.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/mlagentbench.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/mmmu.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/repoqa.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/swebench.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/terminalbench.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/toolbench.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/truthfulqa.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/webarena.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/cache.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/cli.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/config.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/config_inheritance.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/config_migration.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/config_validator.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/config_wizard.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/custom_metrics.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/dashboard.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/dataset_streaming.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/dataset_versioning.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/docker_cache.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/docker_env.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/docker_prewarm.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/dry_run.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/env_expansion.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/failure_analysis.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/few_shot.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/formatting.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/gpu_support.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/harnesses.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/incremental_save.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/infrastructure/__init__.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/infrastructure/azure_health.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/infrastructure/base.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/infrastructure/local.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/infrastructure/manager.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/junit_reporter.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/latency_metrics.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/log_formatter.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/models.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/output_validator.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/preflight.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/pricing.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/profiler.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/providers.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/regression.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/reporting.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/resource_limits.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/result_streaming.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/sampling.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/schema.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/smoke_test.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/state_tracker.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/statistics.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/streaming.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/swebench_test_specs.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/task_batching.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/task_scheduler.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/templates.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/__init__.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/__init__.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/test_azure.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/test_azure_health.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/test_base.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/test_cli_infrastructure.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/test_config.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/test_local.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/test_manager.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_adversarial_benchmark.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_agent.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_benchmark_filtering.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_benchmark_integration.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_benchmarks.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_build_test_command.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_cache.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_claude_plugin.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_cli_templates.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_comparison_aggregation.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_comparison_config.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_comparison_integration.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_comparison_reporting.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_config.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_config_env_vars.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_config_inheritance.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_config_migration.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_config_validator.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_config_validator_inheritance.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_config_wizard.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_cost_calculation.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_custom_benchmark.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_custom_metrics.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_dashboard.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_dataset_streaming.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_dataset_versioning.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_default_logging.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_docker_cache.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_docker_cleanup.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_docker_label_fix.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_docker_prewarm.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_docker_retry.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_dry_run.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_env_expansion.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_error_messages.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_eval_reliability.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_evaluation.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_exit_codes.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_export.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_failure_analysis.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_few_shot.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_formatting.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_git_diff_new_files.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_gpu_support.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_incremental_save.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_integration.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_junit_reporter.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_latency_metrics.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_log_formatter_read_tool.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_longbench_benchmark.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_mcp_health_check.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_mcptoolbench_benchmark.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_mmmu_benchmark.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_models.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_output_validator.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_parse_errors.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_preflight.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_pricing.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_profiler.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_regression.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_reporting.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_resource_limits.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_result_streaming.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_runtime_tracking.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_sampling.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_schema.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_setup_command_fixes.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_smoke_test.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_state_tracker.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_statistics.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_statistics_integration.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_streaming.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_string_concat_bug.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_task_batching.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_task_scheduler.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_templates.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_thinking_budget.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_timeout_tracking.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_tool_failure_tracking.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_trial_mode.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_type_safety.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_xml_export.py +0 -0
- {mcpbr-0.5.2 → mcpbr-0.5.4}/uv.lock +0 -0

`.claude-plugin/marketplace.json`

```diff
@@ -1,7 +1,7 @@
 {
   "$schema": "https://anthropic.com/claude-code/marketplace.schema.json",
   "name": "mcpbr",
-  "version": "0.5.2",
+  "version": "0.5.4",
   "description": "mcpbr - MCP Benchmark Runner plugin marketplace",
   "owner": {
     "name": "mcpbr Contributors",
@@ -11,7 +11,7 @@
     {
       "name": "mcpbr",
       "description": "Expert benchmark runner for MCP servers using mcpbr. Handles Docker checks, config generation, and result parsing.",
-      "version": "0.5.2",
+      "version": "0.5.4",
       "author": {
         "name": "mcpbr Contributors"
       },
```
`PKG-INFO`

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mcpbr
-Version: 0.5.2
+Version: 0.5.4
 Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
 Project-URL: Homepage, https://github.com/greynewell/mcpbr
 Project-URL: Repository, https://github.com/greynewell/mcpbr
```
`pyproject.toml`

```diff
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "mcpbr"
-version = "0.5.2"
+version = "0.5.4"
 description = "Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks"
 readme = "README.md"
 license = "MIT"
```
`src/mcpbr/evaluation.py`

```diff
@@ -93,43 +93,52 @@ async def apply_patch(
 
     workdir = workdir or env.workdir
 
-    # … (removed comment and setup lines; not fully rendered by the diff viewer)
-    exit_code, stdout, stderr = await env.exec_command(
-        "git apply --check fix.patch",
-        timeout=30,
-        workdir=workdir,
-    )
-
-    if exit_code != 0:
-        exit_code2, stdout2, stderr2 = await env.exec_command(
-            "git apply --check -3 fix.patch",
-            timeout=30,
-            workdir=workdir,
-        )
-        if exit_code2 != 0:
-            return False, f"Patch does not apply: {stderr or stderr2}"
-        exit_code, stdout, stderr = await env.exec_command(
-            "git apply -3 fix.patch",
-            timeout=30,
-            workdir=workdir,
-        )
-    else:
-        exit_code, stdout, stderr = await env.exec_command(
-            "git apply fix.patch",
-            timeout=30,
-            workdir=workdir,
-        )
-    # … (old error-check and return lines; not rendered by the diff viewer)
+    # Use longer timeouts for git operations — under concurrent load,
+    # Docker exec can be slow and 30s is insufficient (#399).
+    try:
+        # Reset repository to clean state before applying patch
+        # The agent modified files directly, so we need to restore HEAD state
+        await env.exec_command("git reset --hard HEAD", timeout=120, workdir=workdir)
+        await env.exec_command("git clean -fd", timeout=120, workdir=workdir)
+
+        await env.write_file("fix.patch", patch, workdir=workdir)
+
+        exit_code, stdout, stderr = await env.exec_command(
+            "git apply --check fix.patch",
+            timeout=120,
+            workdir=workdir,
+        )
+
+        if exit_code != 0:
+            exit_code2, stdout2, stderr2 = await env.exec_command(
+                "git apply --check -3 fix.patch",
+                timeout=120,
+                workdir=workdir,
+            )
+            if exit_code2 != 0:
+                return False, f"Patch does not apply: {stderr or stderr2}"
+            exit_code, stdout, stderr = await env.exec_command(
+                "git apply -3 fix.patch",
+                timeout=120,
+                workdir=workdir,
+            )
+        else:
+            exit_code, stdout, stderr = await env.exec_command(
+                "git apply fix.patch",
+                timeout=120,
+                workdir=workdir,
+            )
+
+        if exit_code != 0:
+            return False, f"Failed to apply patch: {stderr}"
+
+        return True, ""
+
+    except (TimeoutError, asyncio.TimeoutError):
+        # Catch exec_command timeouts here so they don't bubble up as
+        # asyncio.TimeoutError to the harness, which would misclassify
+        # this as an agent/eval timeout (#399).
+        return False, "Docker exec timed out during patch application"
 
 
 async def run_tests(
```
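For readers outside this codebase, the check-then-three-way-merge flow above can be reproduced with plain subprocess calls. Below is a minimal standalone sketch under stated assumptions: the `_git` helper and `apply_patch_sketch` names are illustrative and are not mcpbr's API, and the repo is assumed to be a local working tree rather than a Docker container.

```python
import asyncio


async def _git(repo: str, *args: str, timeout: float = 120.0) -> tuple[int, str]:
    """Run a git command in `repo`, bounded by a timeout like the patched code."""
    proc = await asyncio.create_subprocess_exec(
        "git", "-C", repo, *args,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        out, err = await asyncio.wait_for(proc.communicate(), timeout)
    except asyncio.TimeoutError:
        proc.kill()
        raise
    return proc.returncode, (err or out).decode()


async def apply_patch_sketch(repo: str, patch_file: str) -> tuple[bool, str]:
    """Dry-run the patch, fall back to a three-way check, then apply for real."""
    code, msg = await _git(repo, "apply", "--check", patch_file)
    if code != 0:
        # Plain apply would fail; see if a three-way merge can rescue it.
        code3, msg3 = await _git(repo, "apply", "--check", "-3", patch_file)
        if code3 != 0:
            return False, f"Patch does not apply: {msg or msg3}"
        code, msg = await _git(repo, "apply", "-3", patch_file)
    else:
        code, msg = await _git(repo, "apply", patch_file)
    return (code == 0), ("" if code == 0 else msg)
```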
```diff
@@ -282,38 +291,43 @@ async def _apply_test_patch(
 
     workdir = workdir or env.workdir
 
-    # … (removed setup lines; not rendered by the diff viewer)
-    exit_code, stdout, stderr = await env.exec_command(
-        "git apply --check test.patch",
-        timeout=30,
-        workdir=workdir,
-    )
-
-    if exit_code != 0:
-        exit_code, stdout, stderr = await env.exec_command(
-            "git apply --check -3 test.patch",
-            timeout=30,
-            workdir=workdir,
-        )
-        if exit_code != 0:
-            return True, ""
-        exit_code, stdout, stderr = await env.exec_command(
-            "git apply -3 test.patch",
-            timeout=30,
-            workdir=workdir,
-        )
-    else:
-        exit_code, stdout, stderr = await env.exec_command(
-            "git apply test.patch",
-            timeout=30,
-            workdir=workdir,
-        )
-
-    if exit_code != 0:
-        return True, ""
+    try:
+        await env.write_file("test.patch", test_patch, workdir=workdir)
+
+        exit_code, stdout, stderr = await env.exec_command(
+            "git apply --check test.patch",
+            timeout=120,
+            workdir=workdir,
+        )
+
+        if exit_code != 0:
+            exit_code, stdout, stderr = await env.exec_command(
+                "git apply --check -3 test.patch",
+                timeout=120,
+                workdir=workdir,
+            )
+            if exit_code != 0:
+                return True, ""
+            exit_code, stdout, stderr = await env.exec_command(
+                "git apply -3 test.patch",
+                timeout=120,
+                workdir=workdir,
+            )
+        else:
+            exit_code, stdout, stderr = await env.exec_command(
+                "git apply test.patch",
+                timeout=120,
+                workdir=workdir,
+            )
+
+        if exit_code != 0:
+            return True, ""
+
+    except (TimeoutError, asyncio.TimeoutError):
+        # Don't let exec timeouts bubble up to the harness (#399)
+        return True, ""
 
 
 async def evaluate_patch(
```
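One detail worth noting in both hunks is the `except (TimeoutError, asyncio.TimeoutError)` form. From Python 3.11 onward, `asyncio.TimeoutError` is an alias of the builtin `TimeoutError`, but on 3.10 and earlier they are distinct classes, so catching both keeps behavior identical across interpreter versions. A quick self-contained illustration:

```python
import asyncio


async def slow() -> None:
    await asyncio.sleep(10)


async def main() -> None:
    try:
        # asyncio.wait_for raises asyncio.TimeoutError, which is an alias
        # of the builtin TimeoutError only on Python 3.11+.
        await asyncio.wait_for(slow(), timeout=0.01)
    except (TimeoutError, asyncio.TimeoutError):
        print("caught on any supported Python version")


asyncio.run(main())
```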
```diff
@@ -356,7 +370,14 @@ async def evaluate_patch(
 
     # Skip dependency installation for pre-built images (already done)
     if not env.uses_prebuilt:
-        await _install_dependencies(env)
+        try:
+            await _install_dependencies(env)
+        except (TimeoutError, asyncio.TimeoutError):
+            return EvaluationResult(
+                resolved=False,
+                patch_applied=True,
+                error="Docker exec timed out during dependency installation",
+            )
 
     repo = task.get("repo")
```
`src/mcpbr/harness.py`

```diff
@@ -1,6 +1,7 @@
 """Main evaluation harness orchestrating parallel task execution."""
 
 import asyncio
+import logging
 import time
 from dataclasses import dataclass
 from datetime import datetime, timezone
@@ -29,6 +30,7 @@ from .pricing import calculate_cost
 from .profiler import PerformanceProfiler
 
 console = Console()
+logger = logging.getLogger(__name__)
 
 
 class SimpleNamespace:
```
```diff
@@ -56,6 +58,57 @@ def dict_to_namespace(data: Any) -> Any:
     return data
 
 
+# -- Cold-start mitigation helpers (#401) ------------------------------------
+
+# Seconds between each task launch in the first concurrent batch.
+_STAGGER_INTERVAL = 1.0
+
+
+def _stagger_delay(task_index: int, max_concurrent: int) -> float:
+    """Return the startup delay for a task to avoid cold-start contention.
+
+    Only the first batch (indices 0 .. max_concurrent-1) is staggered.
+    The very first task starts immediately; subsequent tasks in the batch
+    get an increasing delay so Docker image pulls and container creation
+    don't all hit at once.
+
+    Args:
+        task_index: Zero-based index of the task in launch order.
+        max_concurrent: Semaphore size / max parallelism.
+
+    Returns:
+        Delay in seconds (0.0 means start immediately).
+    """
+    if max_concurrent <= 1:
+        return 0.0
+    # Only stagger the first batch
+    if task_index >= max_concurrent:
+        return 0.0
+    return task_index * _STAGGER_INTERVAL
+
+
+def _should_retry_zero_iteration(result: dict[str, Any]) -> bool:
+    """Check whether a task result indicates a cold-start failure worth retrying.
+
+    A cold-start failure is characterised by zero iterations AND zero tokens
+    AND a timeout status — the agent never actually ran.
+
+    Args:
+        result: Single-run result dict from _run_mcp_evaluation or _run_baseline_evaluation.
+
+    Returns:
+        True if the result looks like a cold-start failure.
+    """
+    if result.get("status") != "timeout":
+        return False
+    if result.get("iterations", -1) != 0:
+        return False
+    tokens = result.get("tokens", {})
+    if tokens.get("input", -1) != 0 or tokens.get("output", -1) != 0:
+        return False
+    return True
+
+
 @dataclass
 class TaskResult:
     """Result for a single task."""
```
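With `_STAGGER_INTERVAL = 1.0`, the first batch gets delays of 0s, 1s, 2s, and so on up to `max_concurrent - 1` seconds, while everything after the first batch starts immediately. A small sketch, importing the private helper exactly as the new test file does:

```python
from mcpbr.harness import _stagger_delay

# First batch with max_concurrent=4: delays of 0, 1, 2, 3 seconds.
print([_stagger_delay(i, max_concurrent=4) for i in range(4)])
# [0.0, 1.0, 2.0, 3.0]

# Later tasks reuse freed semaphore slots and are not delayed.
print([_stagger_delay(i, max_concurrent=4) for i in range(4, 8)])
# [0.0, 0.0, 0.0, 0.0]
```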
```diff
@@ -302,6 +355,24 @@ async def run_single_task(
                 mcp_server_config=config.mcp_server_a,
                 server_name="server_a",
             )
+            # Retry once on cold-start failure (#401)
+            if result.mcp_server_a and _should_retry_zero_iteration(result.mcp_server_a):
+                logger.info(
+                    "Retrying MCP server_a task %s (zero-iteration cold-start)", instance_id
+                )
+                result.mcp_server_a = await _run_mcp_evaluation(
+                    task,
+                    config,
+                    docker_manager,
+                    benchmark,
+                    verbose,
+                    verbosity,
+                    mcp_log_writer_a if mcp_log_writer_a else log_file,
+                    cache,
+                    mcp_logs_dir,
+                    mcp_server_config=config.mcp_server_a,
+                    server_name="server_a",
+                )
         finally:
             if mcp_log_writer_a:
                 mcp_log_writer_a.close()
```
```diff
@@ -324,6 +395,24 @@ async def run_single_task(
                 mcp_server_config=config.mcp_server_b,
                 server_name="server_b",
             )
+            # Retry once on cold-start failure (#401)
+            if result.mcp_server_b and _should_retry_zero_iteration(result.mcp_server_b):
+                logger.info(
+                    "Retrying MCP server_b task %s (zero-iteration cold-start)", instance_id
+                )
+                result.mcp_server_b = await _run_mcp_evaluation(
+                    task,
+                    config,
+                    docker_manager,
+                    benchmark,
+                    verbose,
+                    verbosity,
+                    mcp_log_writer_b if mcp_log_writer_b else log_file,
+                    cache,
+                    mcp_logs_dir,
+                    mcp_server_config=config.mcp_server_b,
+                    server_name="server_b",
+                )
         finally:
             if mcp_log_writer_b:
                 mcp_log_writer_b.close()
```
```diff
@@ -344,6 +433,20 @@ async def run_single_task(
                 cache,
                 mcp_logs_dir,
             )
+            # Retry once on cold-start failure (#401)
+            if result.mcp and _should_retry_zero_iteration(result.mcp):
+                logger.info("Retrying MCP task %s (zero-iteration cold-start)", instance_id)
+                result.mcp = await _run_mcp_evaluation(
+                    task,
+                    config,
+                    docker_manager,
+                    benchmark,
+                    verbose,
+                    verbosity,
+                    mcp_log_writer if mcp_log_writer else log_file,
+                    cache,
+                    mcp_logs_dir,
+                )
         finally:
             if mcp_log_writer:
                 mcp_log_writer.close()
```
```diff
@@ -363,6 +466,19 @@ async def run_single_task(
                 baseline_log_writer if baseline_log_writer else log_file,
                 cache,
             )
+            # Retry once on cold-start failure (#401)
+            if result.baseline and _should_retry_zero_iteration(result.baseline):
+                logger.info("Retrying baseline task %s (zero-iteration cold-start)", instance_id)
+                result.baseline = await _run_baseline_evaluation(
+                    task,
+                    config,
+                    docker_manager,
+                    benchmark,
+                    verbose,
+                    verbosity,
+                    baseline_log_writer if baseline_log_writer else log_file,
+                    cache,
+                )
         finally:
             if baseline_log_writer:
                 baseline_log_writer.close()
```
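The same retry block is repeated four times: server_a, server_b, single-server MCP, and baseline. The pattern boils down to "run once, and if the result looks like a cold start, run exactly once more". A generic consolidation sketch follows; the `run` callable and predicate wiring are illustrative, not mcpbr's internal API:

```python
from typing import Any, Awaitable, Callable


async def run_with_cold_start_retry(
    run: Callable[[], Awaitable[dict[str, Any]]],
    should_retry: Callable[[dict[str, Any]], bool],
) -> dict[str, Any]:
    """Run an evaluation once; retry a single time on a cold-start failure."""
    result = await run()
    if result and should_retry(result):
        # Second attempt benefits from warm Docker image and layer caches.
        result = await run()
    return result
```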
```diff
@@ -539,7 +655,15 @@ async def _run_mcp_evaluation(
         if env:
             # Track Docker teardown time
             teardown_start = time.time()
-            await env.cleanup()
+            try:
+                await asyncio.wait_for(env.cleanup(), timeout=60)
+            except (asyncio.TimeoutError, Exception) as cleanup_err:
+                logger.warning("Container cleanup failed for MCP task: %s", cleanup_err)
+                try:
+                    if hasattr(env, "container") and env.container:
+                        env.container.remove(force=True)
+                except Exception:
+                    pass
             if profiler:
                 teardown_end = time.time()
                 profiler.record_docker_teardown(teardown_end - teardown_start)
```
```diff
@@ -695,7 +819,15 @@ async def _run_baseline_evaluation(
         if env:
             # Track Docker teardown time
             teardown_start = time.time()
-            await env.cleanup()
+            try:
+                await asyncio.wait_for(env.cleanup(), timeout=60)
+            except (asyncio.TimeoutError, Exception) as cleanup_err:
+                logger.warning("Container cleanup failed for baseline task: %s", cleanup_err)
+                try:
+                    if hasattr(env, "container") and env.container:
+                        env.container.remove(force=True)
+                except Exception:
+                    pass
             if profiler:
                 teardown_end = time.time()
                 profiler.record_docker_teardown(teardown_end - teardown_start)
```
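Both teardown paths follow the same best-effort shape: bound the graceful cleanup with `asyncio.wait_for`, log on failure, then force-remove the container and swallow any error so teardown can never fail a task. A generic sketch of that pattern, with illustrative names and assuming a docker-py style `container.remove(force=True)`:

```python
import asyncio
import logging

logger = logging.getLogger(__name__)


async def best_effort_cleanup(env: object, label: str, timeout: float = 60.0) -> None:
    """Graceful cleanup with a hard time bound and a forced fallback."""
    try:
        await asyncio.wait_for(env.cleanup(), timeout=timeout)
    except Exception as cleanup_err:  # includes asyncio.TimeoutError
        logger.warning("Container cleanup failed for %s: %s", label, cleanup_err)
        try:
            container = getattr(env, "container", None)
            if container:
                container.remove(force=True)  # force-kill and remove the container
        except Exception:
            pass  # teardown must never raise into the harness
```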
```diff
@@ -1013,9 +1145,10 @@ async def run_evaluation(
     semaphore = asyncio.Semaphore(config.max_concurrent)
     budget_exceeded = False
     current_cost = 0.0
+    _task_launch_counter = 0
 
     async def run_with_semaphore(task: dict[str, Any]) -> TaskResult | None:
-        nonlocal current_cost, budget_exceeded
+        nonlocal current_cost, budget_exceeded, _task_launch_counter
 
         # Check budget before running task
         if config.budget and current_cost >= config.budget:
```
```diff
@@ -1023,6 +1156,15 @@ async def run_evaluation(
             return None
 
         async with semaphore:
+            # Stagger first-batch launches to avoid cold-start contention (#401).
+            # Delay is inside the semaphore so the sleeping task holds its slot
+            # and later tasks cannot leapfrog ahead of the first batch.
+            my_index = _task_launch_counter
+            _task_launch_counter += 1
+            delay = _stagger_delay(my_index, config.max_concurrent)
+            if delay > 0:
+                await asyncio.sleep(delay)
+
             result = await run_single_task(
                 task,
                 config,
```
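A design note on placement: if the sleep happened before acquiring the semaphore, a later task with zero delay could grab a freed slot ahead of a still-sleeping first-batch task. Sleeping while holding the slot preserves launch order. A minimal self-contained demo of the pattern, independent of mcpbr:

```python
import asyncio


async def main() -> None:
    max_concurrent = 3
    semaphore = asyncio.Semaphore(max_concurrent)
    counter = 0
    loop = asyncio.get_running_loop()
    t0 = loop.time()

    async def worker(name: str) -> None:
        nonlocal counter
        async with semaphore:
            index = counter
            counter += 1
            # Stagger only the first batch, 0.1s apart for the demo.
            if index < max_concurrent:
                await asyncio.sleep(0.1 * index)
            print(f"{name} started at t={loop.time() - t0:.2f}s")
            await asyncio.sleep(0.2)  # simulated work

    await asyncio.gather(*(worker(f"task-{i}") for i in range(6)))


asyncio.run(main())
```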
`src/mcpbr/infrastructure/azure.py`

```diff
@@ -17,6 +17,7 @@ except ImportError:
 
 from rich.console import Console
 
+from .. import __version__
 from ..config import HarnessConfig
 from .base import InfrastructureProvider
 
```
```diff
@@ -342,9 +343,9 @@ class AzureProvider(InfrastructureProvider):
         else:
             console.print("[green]✓ Node.js installed[/green]")
 
-        # Step 4: Install mcpbr
-        console.print("[cyan]Installing mcpbr...[/cyan]")
-        step4_cmd = f"python{py_ver} -m pip install mcpbr"
+        # Step 4: Install mcpbr (pin to local version)
+        console.print(f"[cyan]Installing mcpbr=={__version__}...[/cyan]")
+        step4_cmd = f"python{py_ver} -m pip install mcpbr=={__version__}"
         exit_code, _stdout, stderr = await self._ssh_exec(step4_cmd, timeout=300)
         if exit_code != 0:
             console.print(f"[yellow]⚠ mcpbr install issues: {stderr[:300]}[/yellow]")
```
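Pinning with `==` ensures the remote VM runs exactly the same mcpbr release as the local CLI that provisioned it, rather than whatever PyPI currently serves as latest. The same idea in isolation, with an illustrative Python version rather than the one mcpbr detects on the VM:

```python
# Build a pip command that installs the exact version of the running package.
from mcpbr import __version__

py_ver = "3.11"  # illustrative; mcpbr determines this on the target VM
install_cmd = f"python{py_ver} -m pip install mcpbr=={__version__}"
print(install_cmd)  # e.g. "python3.11 -m pip install mcpbr==0.5.4"
```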
`tests/test_cold_start.py` (new file, 130 lines)

```python
"""Tests for cold-start staggering and zero-iteration retry logic."""

import asyncio

import pytest

from mcpbr.harness import TaskResult, _should_retry_zero_iteration, _stagger_delay


class TestStaggeredStarts:
    """Verify that concurrent task launches are staggered to avoid cold-start failures."""

    @pytest.mark.asyncio
    async def test_tasks_are_staggered(self) -> None:
        """First-batch tasks should not all start at the same instant.

        When max_concurrent > 1, the semaphore wrapper should insert a small
        delay between task launches so Docker isn't overwhelmed by simultaneous
        image pulls and container startups.
        """
        launch_times: list[float] = []
        loop = asyncio.get_running_loop()

        async def fake_run_single_task(task):
            launch_times.append(loop.time())
            await asyncio.sleep(0.05)  # Simulate brief work
            return TaskResult(instance_id=f"task-{len(launch_times)}")

        tasks = [{"instance_id": f"task-{i}"} for i in range(5)]
        max_concurrent = 5  # All 5 could start at once without staggering

        semaphore = asyncio.Semaphore(max_concurrent)
        task_counter = 0

        async def run_with_semaphore(task):
            nonlocal task_counter
            async with semaphore:
                my_index = task_counter
                task_counter += 1
                delay = _stagger_delay(my_index, max_concurrent)
                if delay > 0:
                    await asyncio.sleep(delay)
                return await fake_run_single_task(task)

        async_tasks = [asyncio.create_task(run_with_semaphore(t)) for t in tasks]
        await asyncio.gather(*async_tasks)

        assert len(launch_times) == 5

        # The first and last task should be separated by at least some delay
        spread = launch_times[-1] - launch_times[0]
        assert spread > 0.1, (
            f"Tasks launched with only {spread:.3f}s spread — expected staggering to space them out"
        )

    @pytest.mark.asyncio
    async def test_stagger_delay_values(self) -> None:
        """_stagger_delay should return increasing delays for the first batch."""
        # First task: no delay
        assert _stagger_delay(0, max_concurrent=5) == 0.0

        # Subsequent first-batch tasks: increasing delay
        d1 = _stagger_delay(1, max_concurrent=5)
        d2 = _stagger_delay(2, max_concurrent=5)
        assert d1 > 0
        assert d2 > d1

        # Tasks beyond the first batch: no delay
        assert _stagger_delay(5, max_concurrent=5) == 0.0
        assert _stagger_delay(10, max_concurrent=5) == 0.0

    @pytest.mark.asyncio
    async def test_stagger_delay_single_concurrent(self) -> None:
        """With max_concurrent=1, no staggering is needed."""
        assert _stagger_delay(0, max_concurrent=1) == 0.0
        assert _stagger_delay(1, max_concurrent=1) == 0.0


class TestZeroIterationRetry:
    """Verify that _should_retry_zero_iteration detects cold-start failures."""

    @pytest.mark.asyncio
    async def test_detects_cold_start_failure(self) -> None:
        """Zero iterations + zero tokens + timeout = cold-start failure."""
        zero_iter_result = {
            "resolved": False,
            "patch_applied": False,
            "status": "timeout",
            "error": "Timeout",
            "tokens": {"input": 0, "output": 0},
            "iterations": 0,
            "tool_calls": 0,
            "cost": 0.0,
            "runtime_seconds": 236.0,
        }
        assert _should_retry_zero_iteration(zero_iter_result) is True

    @pytest.mark.asyncio
    async def test_completed_task_not_retried(self) -> None:
        """A task that completed successfully should never be retried."""
        good_result = {
            "resolved": True,
            "status": "completed",
            "iterations": 20,
            "tokens": {"input": 10000, "output": 5000},
        }
        assert _should_retry_zero_iteration(good_result) is False

    @pytest.mark.asyncio
    async def test_nonzero_iteration_timeout_not_retried(self) -> None:
        """A timeout with real iterations is a genuine timeout, not cold-start."""
        real_timeout = {
            "resolved": False,
            "status": "timeout",
            "iterations": 5,
            "tokens": {"input": 3000, "output": 1500},
        }
        assert _should_retry_zero_iteration(real_timeout) is False

    @pytest.mark.asyncio
    async def test_non_timeout_error_not_retried(self) -> None:
        """Zero iterations from a non-timeout error should not trigger retry."""
        error_result = {
            "resolved": False,
            "status": "error",
            "error": "Something broke",
            "iterations": 0,
            "tokens": {"input": 0, "output": 0},
        }
        assert _should_retry_zero_iteration(error_result) is False
```
`tests/test_mcp_logging.py` (removed lines that the diff viewer truncated are marked with `…`)

```diff
@@ -61,7 +61,8 @@ class TestMCPLogging:
                 1,
                 "",
                 "npx: command not found",
-            ),  # …
+            ),  # .mcp.json write fails
+            (0, "", ""),  # chown .mcp.json
             (0, "", ""),  # cleanup temp files
         ]
 
@@ -87,7 +88,7 @@ class TestMCPLogging:
 
         # Verify registration failure was caught
         assert result.success is False
-        assert "MCP …
+        assert "MCP config write failed" in result.error
         assert "npx: command not found" in result.error
 
         # Verify cleanup was called
@@ -110,6 +111,7 @@ class TestMCPLogging:
             (0, "", ""),  # env file write
             (0, "", ""),  # chown env
             (1, "Server starting...\nInitialization failed", "Error: Missing API key"),
+            (0, "", ""),  # chown .mcp.json
             (0, "", ""),  # cleanup
         ]
 
@@ -133,10 +135,10 @@ class TestMCPLogging:
             task_id="test_id",
         )
 
-        # Verify …
+        # Verify stderr is in error message and stdout is captured separately
        assert "Error: Missing API key" in result.error
-        assert "Server starting" in result.error or "Initialization failed" in result.error
         assert result.stdout is not None
+        assert "Server starting" in result.stdout or "Initialization failed" in result.stdout
 
     @pytest.mark.asyncio
     async def test_mcp_timeout_cleanup(self, harness: ClaudeCodeHarness) -> None:
@@ -177,8 +179,7 @@ class TestMCPLogging:
 
         # Verify timeout was caught
         assert result.success is False
-        assert "…
-        assert "failed to start or is hanging" in result.error
+        assert "Failed to write MCP configuration file" in result.error
 
         # Verify cleanup was called
         cleanup_calls = [
@@ -255,9 +256,10 @@ Debug: Cache miss for /workspace/"""
             (0, "", ""),  # chown prompt
             (0, "", ""),  # env file write
             (0, "", ""),  # chown env
-            (0, "MCP server registered successfully", ""),  # …
-            (0, "", ""),  # …
-            (0, "", ""),  # rm
+            (0, "MCP server registered successfully", ""),  # .mcp.json write
+            (0, "", ""),  # chown .mcp.json
+            (0, "", ""),  # rm .mcp.json (exit_code != 0 path)
+            (0, "", ""),  # rm temp files (finally cleanup)
         ]
 
         # Mock streaming execution with our test output
```
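The fixed-up mocks above rely on the standard `unittest.mock` technique of giving a mock a list `side_effect`, so each successive `exec_command` call returns the next scripted `(exit_code, stdout, stderr)` tuple. That is why inserting the new `chown .mcp.json` step required adding a tuple at the right position in each list. A minimal sketch of the technique, with illustrative call arguments:

```python
import asyncio
from unittest.mock import AsyncMock

# Each await on the mock consumes the next scripted result in order.
exec_command = AsyncMock(side_effect=[
    (0, "", ""),                        # first call: succeeds
    (1, "", "npx: command not found"),  # second call: fails
])


async def demo() -> None:
    print(await exec_command("step 1"))  # (0, '', '')
    print(await exec_command("step 2"))  # (1, '', 'npx: command not found')


asyncio.run(demo())
```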