mcpbr 0.4.16__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/marketplace.json +2 -2
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/package.json +1 -1
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/plugin.json +1 -1
- mcpbr-0.6.0/.dockerignore +55 -0
- mcpbr-0.6.0/.gitattributes +2 -0
- mcpbr-0.6.0/.github/workflows/build-binaries.yml +83 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.gitignore +1 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.pre-commit-config.yaml +1 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/CHANGELOG.md +55 -0
- mcpbr-0.6.0/Dockerfile.app +61 -0
- mcpbr-0.6.0/Formula/mcpbr.rb +51 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/PKG-INFO +8 -1
- mcpbr-0.6.0/action/README.md +46 -0
- mcpbr-0.6.0/action/action.yml +90 -0
- mcpbr-0.6.0/action/examples/basic.yml +25 -0
- mcpbr-0.6.0/action/examples/matrix.yml +62 -0
- mcpbr-0.6.0/ci-templates/circleci/README.md +56 -0
- mcpbr-0.6.0/ci-templates/circleci/orb.yml +134 -0
- mcpbr-0.6.0/ci-templates/gitlab/.gitlab-ci-mcpbr.yml +69 -0
- mcpbr-0.6.0/ci-templates/gitlab/README.md +44 -0
- mcpbr-0.6.0/conda/README.md +54 -0
- mcpbr-0.6.0/conda/bld.bat +3 -0
- mcpbr-0.6.0/conda/build.sh +5 -0
- mcpbr-0.6.0/conda/meta.yaml +59 -0
- mcpbr-0.6.0/docker/README.md +59 -0
- mcpbr-0.6.0/docker/docker-compose.yml +29 -0
- mcpbr-0.6.0/docker/entrypoint.sh +16 -0
- mcpbr-0.6.0/homebrew/README.md +39 -0
- mcpbr-0.6.0/mcpbr.spec +83 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/package.json +1 -1
- {mcpbr-0.4.16 → mcpbr-0.6.0}/pyproject.toml +4 -1
- mcpbr-0.6.0/src/mcpbr/__init__.py +25 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/config.py +37 -1
- mcpbr-0.6.0/src/mcpbr/config_migration.py +470 -0
- mcpbr-0.6.0/src/mcpbr/config_wizard.py +647 -0
- mcpbr-0.6.0/src/mcpbr/dashboard.py +619 -0
- mcpbr-0.6.0/src/mcpbr/dataset_streaming.py +491 -0
- mcpbr-0.6.0/src/mcpbr/docker_cache.py +539 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/docker_env.py +2 -1
- mcpbr-0.6.0/src/mcpbr/docker_prewarm.py +370 -0
- mcpbr-0.6.0/src/mcpbr/dry_run.py +533 -0
- mcpbr-0.6.0/src/mcpbr/formatting.py +444 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/gpu_support.py +2 -1
- mcpbr-0.6.0/src/mcpbr/graceful_degradation.py +277 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/harness.py +38 -4
- mcpbr-0.6.0/src/mcpbr/languages.py +228 -0
- mcpbr-0.6.0/src/mcpbr/logging_config.py +207 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/models.py +66 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/preflight.py +2 -1
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/pricing.py +72 -0
- mcpbr-0.6.0/src/mcpbr/providers.py +549 -0
- mcpbr-0.6.0/src/mcpbr/resource_limits.py +487 -0
- mcpbr-0.6.0/src/mcpbr/result_streaming.py +519 -0
- mcpbr-0.6.0/src/mcpbr/sdk.py +264 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/smoke_test.py +2 -1
- mcpbr-0.6.0/src/mcpbr/task_batching.py +403 -0
- mcpbr-0.6.0/src/mcpbr/task_scheduler.py +468 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_adversarial_benchmark.py +4 -10
- mcpbr-0.6.0/tests/test_config_migration.py +971 -0
- mcpbr-0.6.0/tests/test_config_wizard.py +1142 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_custom_benchmark.py +11 -31
- mcpbr-0.6.0/tests/test_dashboard.py +748 -0
- mcpbr-0.6.0/tests/test_dataset_streaming.py +837 -0
- mcpbr-0.6.0/tests/test_docker_cache.py +898 -0
- mcpbr-0.6.0/tests/test_docker_prewarm.py +672 -0
- mcpbr-0.6.0/tests/test_dry_run.py +833 -0
- mcpbr-0.6.0/tests/test_formatting.py +703 -0
- mcpbr-0.6.0/tests/test_graceful_degradation.py +621 -0
- mcpbr-0.6.0/tests/test_logging_config.py +595 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_models.py +2 -2
- mcpbr-0.6.0/tests/test_multi_language.py +454 -0
- mcpbr-0.6.0/tests/test_multi_provider.py +625 -0
- mcpbr-0.6.0/tests/test_platform_files.py +706 -0
- mcpbr-0.6.0/tests/test_resource_limits.py +859 -0
- mcpbr-0.6.0/tests/test_result_streaming.py +834 -0
- mcpbr-0.6.0/tests/test_sdk.py +515 -0
- mcpbr-0.6.0/tests/test_task_batching.py +819 -0
- mcpbr-0.6.0/tests/test_task_scheduler.py +643 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/uv.lock +305 -5
- mcpbr-0.4.16/src/mcpbr/__init__.py +0 -6
- mcpbr-0.4.16/src/mcpbr/providers.py +0 -236
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude/settings.json +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/README.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/skills/README.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/skills/benchmark-swe-lite/SKILL.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/skills/mcpbr-config/SKILL.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/skills/mcpbr-eval/SKILL.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/dependabot.yml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/release-drafter.yml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/workflows/ci.yml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/workflows/post-release-bump.yml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/workflows/publish-npm.yml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/workflows/publish.yml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/workflows/release-drafter.yml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/AGENTS.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/CLAUDE.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/CODE_OF_CONDUCT.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/CONTRIBUTING.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/Dockerfile +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/HUMANEVAL_FIX_SUMMARY.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/LICENSE +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/Makefile +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/PR_SUMMARY.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/README.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/SECURITY.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/assets/mcpbr-demo.gif +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/assets/mcpbr-eval-results.png +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/assets/mcpbr-logo.jpg +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/bin/mcpbr.js +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/config/example.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/config/humaneval.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/config/supermodel.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/azure-config-example.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/custom-benchmark.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/env-vars-example.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/inheritance/README.md +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/inheritance/base-config.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/inheritance/dev-config.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/inheritance/multi-extend-config.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/inheritance/production-config.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/inheritance/shared-mcp-settings.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/local-config-example.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/quick-start/gsm8k-math-reasoning.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/quick-start/test-your-mcp-server.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/install.sh +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/requirements.txt +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/scripts/sync_version.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/scripts/validate_plugin_manifests.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/__main__.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/agent.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/__init__.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/adversarial.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/agentbench.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/aider_polyglot.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/apps.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/arc.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/base.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/bigbench_hard.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/bigcodebench.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/codecontests.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/codereval.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/custom.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/cybergym.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/gaia.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/gsm8k.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/hellaswag.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/humaneval.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/intercode.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/leetcode.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/longbench.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/math_benchmark.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/mbpp.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/mcptoolbench.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/mlagentbench.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/mmmu.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/repoqa.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/swebench.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/terminalbench.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/toolbench.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/truthfulqa.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/webarena.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/cache.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/cli.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/config_inheritance.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/config_validator.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/custom_metrics.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/dataset_versioning.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/env_expansion.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/evaluation.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/failure_analysis.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/few_shot.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/harnesses.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/incremental_save.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/infrastructure/__init__.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/infrastructure/azure.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/infrastructure/azure_health.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/infrastructure/base.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/infrastructure/local.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/infrastructure/manager.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/junit_reporter.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/latency_metrics.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/log_formatter.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/output_validator.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/profiler.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/regression.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/reporting.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/sampling.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/schema.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/state_tracker.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/statistics.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/streaming.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/swebench_test_specs.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/templates.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/__init__.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/__init__.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/test_azure.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/test_azure_health.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/test_base.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/test_cli_infrastructure.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/test_config.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/test_local.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/test_manager.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_agent.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_benchmark_filtering.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_benchmark_integration.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_benchmarks.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_build_test_command.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_cache.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_claude_plugin.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_cli_templates.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_comparison_aggregation.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_comparison_config.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_comparison_integration.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_comparison_reporting.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_config.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_config_env_vars.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_config_inheritance.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_config_validator.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_config_validator_inheritance.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_cost_calculation.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_custom_metrics.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_dataset_versioning.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_default_logging.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_docker_cleanup.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_docker_label_fix.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_docker_retry.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_env_expansion.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_error_messages.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_evaluation.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_exit_codes.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_export.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_failure_analysis.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_few_shot.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_git_diff_new_files.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_gpu_support.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_incremental_save.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_integration.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_junit_reporter.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_latency_metrics.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_log_formatter_read_tool.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_longbench_benchmark.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_mcp_health_check.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_mcp_logging.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_mcptoolbench_benchmark.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_mmmu_benchmark.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_output_validator.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_parse_errors.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_preflight.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_pricing.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_profiler.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_regression.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_reporting.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_runtime_tracking.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_sampling.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_schema.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_smoke_test.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_state_tracker.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_statistics.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_statistics_integration.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_streaming.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_string_concat_bug.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_templates.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_thinking_budget.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_timeout_tracking.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_tool_failure_tracking.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_trial_mode.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_type_safety.py +0 -0
- {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_xml_export.py +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"$schema": "https://anthropic.com/claude-code/marketplace.schema.json",
|
|
3
3
|
"name": "mcpbr",
|
|
4
|
-
"version": "0.4.16",
|
|
4
|
+
"version": "0.6.0",
|
|
5
5
|
"description": "mcpbr - MCP Benchmark Runner plugin marketplace",
|
|
6
6
|
"owner": {
|
|
7
7
|
"name": "mcpbr Contributors",
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
{
|
|
12
12
|
"name": "mcpbr",
|
|
13
13
|
"description": "Expert benchmark runner for MCP servers using mcpbr. Handles Docker checks, config generation, and result parsing.",
|
|
14
|
-
"version": "0.4.16",
|
|
14
|
+
"version": "0.6.0",
|
|
15
15
|
"author": {
|
|
16
16
|
"name": "mcpbr Contributors"
|
|
17
17
|
},
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Git
|
|
2
|
+
.git
|
|
3
|
+
.gitignore
|
|
4
|
+
|
|
5
|
+
# Python
|
|
6
|
+
__pycache__
|
|
7
|
+
*.pyc
|
|
8
|
+
*.pyo
|
|
9
|
+
*.egg-info
|
|
10
|
+
dist/
|
|
11
|
+
build/
|
|
12
|
+
.eggs/
|
|
13
|
+
*.egg
|
|
14
|
+
|
|
15
|
+
# Virtual environments
|
|
16
|
+
.venv
|
|
17
|
+
venv
|
|
18
|
+
env
|
|
19
|
+
|
|
20
|
+
# IDE
|
|
21
|
+
.vscode
|
|
22
|
+
.idea
|
|
23
|
+
*.swp
|
|
24
|
+
*.swo
|
|
25
|
+
|
|
26
|
+
# Testing
|
|
27
|
+
.pytest_cache
|
|
28
|
+
.coverage
|
|
29
|
+
htmlcov/
|
|
30
|
+
.tox
|
|
31
|
+
|
|
32
|
+
# Documentation
|
|
33
|
+
docs/
|
|
34
|
+
site/
|
|
35
|
+
mkdocs.yml
|
|
36
|
+
|
|
37
|
+
# CI/CD
|
|
38
|
+
.github/
|
|
39
|
+
ci-templates/
|
|
40
|
+
|
|
41
|
+
# Docker (avoid recursive copies)
|
|
42
|
+
docker/results/
|
|
43
|
+
|
|
44
|
+
# Node
|
|
45
|
+
node_modules/
|
|
46
|
+
npm-debug.log
|
|
47
|
+
|
|
48
|
+
# OS
|
|
49
|
+
.DS_Store
|
|
50
|
+
Thumbs.db
|
|
51
|
+
|
|
52
|
+
# Secrets
|
|
53
|
+
.env
|
|
54
|
+
.env.*
|
|
55
|
+
credentials.json
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
name: Build Binaries
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
inputs:
|
|
8
|
+
tag:
|
|
9
|
+
description: "Release tag to build (e.g., v0.6.0)"
|
|
10
|
+
required: false
|
|
11
|
+
type: string
|
|
12
|
+
|
|
13
|
+
permissions:
|
|
14
|
+
contents: write
|
|
15
|
+
|
|
16
|
+
jobs:
|
|
17
|
+
build:
|
|
18
|
+
runs-on: ${{ matrix.os }}
|
|
19
|
+
strategy:
|
|
20
|
+
fail-fast: false
|
|
21
|
+
matrix:
|
|
22
|
+
include:
|
|
23
|
+
- os: ubuntu-latest
|
|
24
|
+
platform: linux
|
|
25
|
+
arch: x86_64
|
|
26
|
+
artifact-name: mcpbr-linux-x86_64
|
|
27
|
+
- os: macos-latest
|
|
28
|
+
platform: macos
|
|
29
|
+
arch: arm64
|
|
30
|
+
artifact-name: mcpbr-macos-arm64
|
|
31
|
+
- os: macos-13-large
|
|
32
|
+
platform: macos
|
|
33
|
+
arch: x86_64
|
|
34
|
+
artifact-name: mcpbr-macos-x86_64
|
|
35
|
+
- os: windows-latest
|
|
36
|
+
platform: windows
|
|
37
|
+
arch: x86_64
|
|
38
|
+
artifact-name: mcpbr-windows-x86_64
|
|
39
|
+
|
|
40
|
+
steps:
|
|
41
|
+
- uses: actions/checkout@v4
|
|
42
|
+
with:
|
|
43
|
+
ref: ${{ github.event.inputs.tag || github.ref }}
|
|
44
|
+
|
|
45
|
+
- name: Set up Python
|
|
46
|
+
uses: actions/setup-python@v5
|
|
47
|
+
with:
|
|
48
|
+
python-version: "3.11"
|
|
49
|
+
|
|
50
|
+
- name: Install dependencies
|
|
51
|
+
run: |
|
|
52
|
+
python -m pip install --upgrade pip
|
|
53
|
+
pip install pyinstaller
|
|
54
|
+
pip install -e "."
|
|
55
|
+
|
|
56
|
+
- name: Build binary (Unix)
|
|
57
|
+
if: matrix.platform != 'windows'
|
|
58
|
+
run: |
|
|
59
|
+
pyinstaller mcpbr.spec
|
|
60
|
+
cd dist
|
|
61
|
+
tar -czf ${{ matrix.artifact-name }}.tar.gz mcpbr
|
|
62
|
+
cd ..
|
|
63
|
+
|
|
64
|
+
- name: Build binary (Windows)
|
|
65
|
+
if: matrix.platform == 'windows'
|
|
66
|
+
run: |
|
|
67
|
+
pyinstaller mcpbr.spec
|
|
68
|
+
cd dist
|
|
69
|
+
Compress-Archive -Path mcpbr.exe -DestinationPath ${{ matrix.artifact-name }}.zip
|
|
70
|
+
cd ..
|
|
71
|
+
shell: pwsh
|
|
72
|
+
|
|
73
|
+
- name: Upload artifact
|
|
74
|
+
uses: actions/upload-artifact@v4
|
|
75
|
+
with:
|
|
76
|
+
name: ${{ matrix.artifact-name }}
|
|
77
|
+
path: dist/mcpbr-*.*
|
|
78
|
+
|
|
79
|
+
- name: Upload to release
|
|
80
|
+
if: github.event_name == 'release'
|
|
81
|
+
uses: softprops/action-gh-release@v2
|
|
82
|
+
with:
|
|
83
|
+
files: dist/mcpbr-*.*
|
|
@@ -7,6 +7,59 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.6.0] - 2026-02-05
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- **Graceful degradation** (#70): Fault-tolerant task execution with failure isolation, classification (transient/permanent/unknown), configurable `continue_on_error` and `max_failures` policies, execution checkpointing for crash recovery, and partial report generation
|
|
15
|
+
- New config fields: `continue_on_error`, `max_failures`, `checkpoint_interval`, `resume_from_checkpoint`
|
|
16
|
+
- **Multi-provider support** (#229): Added OpenAI, Google Gemini, and Alibaba Qwen as model providers alongside Anthropic
|
|
17
|
+
- `OpenAIProvider` for GPT-4o, GPT-4 Turbo, and GPT-4o Mini models
|
|
18
|
+
- `GeminiProvider` for Gemini 2.0 Flash, Gemini 1.5 Pro, and Gemini 1.5 Flash models
|
|
19
|
+
- `QwenProvider` for Qwen Plus, Qwen Turbo, and Qwen Max models via DashScope API
|
|
20
|
+
- New optional dependencies: `openai`, `gemini`, `all-providers` extras
|
|
21
|
+
- Pricing data for all 9 new models
|
|
22
|
+
- Model registry entries with context window and tool support metadata
|
|
23
|
+
- **Multi-language support** (#230): Cross-language benchmark execution for Python, JavaScript, TypeScript, Java, and Go
|
|
24
|
+
- Per-language Docker images, run/compile commands, and test framework configs
|
|
25
|
+
- Automatic language detection from filenames and code patterns
|
|
26
|
+
- Cross-language metrics aggregation
|
|
27
|
+
- **Structured logging** (#231): JSON and human-readable log formatters with contextual metadata
|
|
28
|
+
- File rotation, configurable log levels via `MCPBR_LOG_LEVEL` env var
|
|
29
|
+
- `LogContext` context manager for injecting task/benchmark fields into log records
|
|
30
|
+
- **Public Python SDK** (#232): Programmatic API for configuring and running benchmarks
|
|
31
|
+
- `MCPBenchmark` class with config from dict, YAML, or `HarnessConfig`
|
|
32
|
+
- `list_benchmarks()`, `list_providers()`, `list_models()`, `get_version()` helpers
|
|
33
|
+
- Exported in top-level `mcpbr` package for `from mcpbr import MCPBenchmark`
|
|
34
|
+
- **Platform distribution files**: Docker, Conda, Homebrew, GitHub Actions, and CI templates
|
|
35
|
+
- `Dockerfile.app` multi-stage build for container deployment
|
|
36
|
+
- `docker/docker-compose.yml` for multi-container orchestration
|
|
37
|
+
- `conda/meta.yaml` recipe for Conda packaging
|
|
38
|
+
- `action/action.yml` GitHub Action with basic and matrix examples
|
|
39
|
+
- `ci-templates/` for GitLab CI and CircleCI integration
|
|
40
|
+
|
|
41
|
+
## [0.5.0] - 2026-02-05
|
|
42
|
+
|
|
43
|
+
### Added
|
|
44
|
+
|
|
45
|
+
- **Real-time web dashboard** (#58): Live monitoring of benchmark evaluations via `DashboardServer` with FastAPI + WebSocket, task progress, resolution rate, ETA, and pause/resume/cancel controls
|
|
46
|
+
- **Interactive configuration wizard** (#74): Step-by-step CLI wizard for creating config files with presets (filesystem, web-search, database), model/benchmark selection, and MCP server setup
|
|
47
|
+
- **Dry-run mode** (#84): Preview evaluation plan without executing — shows tasks, estimated cost/time, validates config, checks Docker and MCP server availability
|
|
48
|
+
- **Task prioritization and scheduling** (#92): Intelligent task ordering with speed-first, cost-first, coverage-first, and custom scoring strategies
|
|
49
|
+
- **Color and formatting options** (#105): Configurable output themes (default, minimal, plain) with NO_COLOR convention support and MCPBR_THEME env var
|
|
50
|
+
- **Docker image pre-warming** (#128): Pre-pull Docker images in parallel before evaluation starts with progress reporting and cache detection
|
|
51
|
+
- **Result streaming to external storage** (#131): Stream results as tasks complete to local JSONL files, S3-compatible storage, or webhooks with buffering and retry
|
|
52
|
+
- **Memory-efficient large dataset handling** (#134): Streaming and chunked loading of large HuggingFace datasets with memory monitoring and automatic chunk-size adaptation
|
|
53
|
+
- **Task batching with smart scheduling** (#137): Group similar tasks by repo/image/category to minimize Docker container restarts with adaptive batch sizing
|
|
54
|
+
- **Resource limit configuration** (#139): Configure CPU, memory, disk, PID, and network limits for Docker containers with monitoring and violation reporting
|
|
55
|
+
- **Configuration migration tool** (#195): Detect and migrate old config formats (V1→V4) with dry-run preview, backup, and chained migration steps
|
|
56
|
+
- **Docker image caching optimization** (#228): LRU cache management with size limits, usage tracking, eviction, warmup recommendations, and dangling image cleanup
|
|
57
|
+
|
|
58
|
+
### Fixed
|
|
59
|
+
|
|
60
|
+
- **Zero-cost metrics on evaluation timeout** (#374): Agent metrics (cost, tokens, iterations) were discarded when `benchmark.evaluate()` timed out after the agent had already completed successfully. Now preserves agent results when available.
|
|
61
|
+
- **Process hang after evaluation completes** (#374): `asyncio.run()` blocked indefinitely during cleanup because Docker SDK urllib3 background threads kept the default executor alive. Now force-shuts down the executor with `wait=False`.
|
|
62
|
+
|
|
10
63
|
## [0.4.16] - 2026-02-05
|
|
11
64
|
|
|
12
65
|
### Added
|
|
@@ -729,6 +782,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
729
782
|
[0.3.14]: https://github.com/greynewell/mcpbr/releases/tag/v0.3.14
|
|
730
783
|
[0.3.13]: https://github.com/greynewell/mcpbr/releases/tag/v0.3.13
|
|
731
784
|
[0.3.12]: https://github.com/greynewell/mcpbr/releases/tag/v0.3.12
|
|
785
|
+
[0.6.0]: https://github.com/greynewell/mcpbr/releases/tag/v0.6.0
|
|
786
|
+
[0.5.0]: https://github.com/greynewell/mcpbr/releases/tag/v0.5.0
|
|
732
787
|
[0.4.16]: https://github.com/greynewell/mcpbr/releases/tag/v0.4.16
|
|
733
788
|
[0.3.11]: https://github.com/greynewell/mcpbr/releases/tag/v0.3.11
|
|
734
789
|
[0.3.10]: https://github.com/greynewell/mcpbr/releases/tag/v0.3.10
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# Multi-stage Dockerfile for running mcpbr CLI
|
|
2
|
+
# This is NOT the task environment Dockerfile - see Dockerfile for that.
|
|
3
|
+
# This image packages the mcpbr benchmark runner itself.
|
|
4
|
+
|
|
5
|
+
# Stage 1: Build
|
|
6
|
+
FROM python:3.11-slim AS builder
|
|
7
|
+
|
|
8
|
+
WORKDIR /build
|
|
9
|
+
|
|
10
|
+
# Install build dependencies
|
|
11
|
+
RUN pip install --no-cache-dir build hatchling
|
|
12
|
+
|
|
13
|
+
# Copy project files
|
|
14
|
+
COPY pyproject.toml README.md LICENSE ./
|
|
15
|
+
COPY src/ src/
|
|
16
|
+
|
|
17
|
+
# Build the wheel
|
|
18
|
+
RUN python -m build --wheel --outdir /build/dist
|
|
19
|
+
|
|
20
|
+
# Stage 2: Runtime
|
|
21
|
+
FROM python:3.11-slim
|
|
22
|
+
|
|
23
|
+
LABEL maintainer="mcpbr Contributors"
|
|
24
|
+
LABEL org.opencontainers.image.source="https://github.com/greynewell/mcpbr"
|
|
25
|
+
LABEL org.opencontainers.image.description="MCP Benchmark Runner - evaluate MCP servers against software engineering benchmarks"
|
|
26
|
+
LABEL org.opencontainers.image.licenses="MIT"
|
|
27
|
+
|
|
28
|
+
# Install runtime system dependencies
|
|
29
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
30
|
+
git \
|
|
31
|
+
curl \
|
|
32
|
+
ca-certificates \
|
|
33
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
34
|
+
|
|
35
|
+
# Create non-root user
|
|
36
|
+
RUN groupadd --gid 1000 mcpbr && \
|
|
37
|
+
useradd --uid 1000 --gid mcpbr --shell /bin/bash --create-home mcpbr
|
|
38
|
+
|
|
39
|
+
WORKDIR /home/mcpbr
|
|
40
|
+
|
|
41
|
+
# Install the built wheel from the builder stage
|
|
42
|
+
COPY --from=builder /build/dist/*.whl /tmp/
|
|
43
|
+
RUN pip install --no-cache-dir /tmp/*.whl && \
|
|
44
|
+
rm -f /tmp/*.whl
|
|
45
|
+
|
|
46
|
+
# Copy entrypoint
|
|
47
|
+
COPY docker/entrypoint.sh /usr/local/bin/entrypoint.sh
|
|
48
|
+
RUN chmod +x /usr/local/bin/entrypoint.sh
|
|
49
|
+
|
|
50
|
+
# Create directories for configs and results
|
|
51
|
+
RUN mkdir -p /home/mcpbr/configs /home/mcpbr/results && \
|
|
52
|
+
chown -R mcpbr:mcpbr /home/mcpbr
|
|
53
|
+
|
|
54
|
+
# Switch to non-root user
|
|
55
|
+
USER mcpbr
|
|
56
|
+
|
|
57
|
+
# Mount points for user configs and results
|
|
58
|
+
VOLUME ["/home/mcpbr/configs", "/home/mcpbr/results"]
|
|
59
|
+
|
|
60
|
+
ENTRYPOINT ["entrypoint.sh"]
|
|
61
|
+
CMD ["--help"]
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
class Mcpbr < Formula
|
|
2
|
+
include Language::Python::Virtualenv
|
|
3
|
+
|
|
4
|
+
desc "Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks"
|
|
5
|
+
homepage "https://github.com/greynewell/mcpbr"
|
|
6
|
+
# NOTE: Update URL and sha256 when publishing a release.
|
|
7
|
+
# Run: curl -sL <url> | shasum -a 256
|
|
8
|
+
url "https://files.pythonhosted.org/packages/source/m/mcpbr/mcpbr-0.6.0.tar.gz"
|
|
9
|
+
sha256 "PLACEHOLDER_SHA256_REPLACE_ON_RELEASE"
|
|
10
|
+
license "MIT"
|
|
11
|
+
|
|
12
|
+
depends_on "python@3.11"
|
|
13
|
+
|
|
14
|
+
resource "anthropic" do
|
|
15
|
+
url "https://files.pythonhosted.org/packages/source/a/anthropic/anthropic-0.40.0.tar.gz"
|
|
16
|
+
sha256 "PLACEHOLDER_SHA256"
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
resource "click" do
|
|
20
|
+
url "https://files.pythonhosted.org/packages/source/c/click/click-8.1.7.tar.gz"
|
|
21
|
+
sha256 "PLACEHOLDER_SHA256"
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
resource "docker" do
|
|
25
|
+
url "https://files.pythonhosted.org/packages/source/d/docker/docker-7.0.0.tar.gz"
|
|
26
|
+
sha256 "PLACEHOLDER_SHA256"
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
resource "pydantic" do
|
|
30
|
+
url "https://files.pythonhosted.org/packages/source/p/pydantic/pydantic-2.0.0.tar.gz"
|
|
31
|
+
sha256 "PLACEHOLDER_SHA256"
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
resource "pyyaml" do
|
|
35
|
+
url "https://files.pythonhosted.org/packages/source/p/PyYAML/PyYAML-6.0.1.tar.gz"
|
|
36
|
+
sha256 "PLACEHOLDER_SHA256"
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
resource "rich" do
|
|
40
|
+
url "https://files.pythonhosted.org/packages/source/r/rich/rich-13.0.0.tar.gz"
|
|
41
|
+
sha256 "PLACEHOLDER_SHA256"
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def install
|
|
45
|
+
virtualenv_install_with_resources
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
test do
|
|
49
|
+
assert_match "mcpbr", shell_output("#{bin}/mcpbr --version")
|
|
50
|
+
end
|
|
51
|
+
end
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mcpbr
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
|
|
5
5
|
Project-URL: Homepage, https://github.com/greynewell/mcpbr
|
|
6
6
|
Project-URL: Repository, https://github.com/greynewell/mcpbr
|
|
@@ -30,6 +30,9 @@ Requires-Dist: pydantic>=2.0.0
|
|
|
30
30
|
Requires-Dist: pyyaml>=6.0.0
|
|
31
31
|
Requires-Dist: requests>=2.31.0
|
|
32
32
|
Requires-Dist: rich>=13.0.0
|
|
33
|
+
Provides-Extra: all-providers
|
|
34
|
+
Requires-Dist: google-generativeai>=0.3.0; extra == 'all-providers'
|
|
35
|
+
Requires-Dist: openai>=1.0.0; extra == 'all-providers'
|
|
33
36
|
Provides-Extra: dev
|
|
34
37
|
Requires-Dist: pre-commit>=3.0.0; extra == 'dev'
|
|
35
38
|
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
|
|
@@ -40,6 +43,10 @@ Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
|
|
|
40
43
|
Requires-Dist: mkdocs-minify-plugin>=0.7.0; extra == 'docs'
|
|
41
44
|
Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
|
|
42
45
|
Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
|
|
46
|
+
Provides-Extra: gemini
|
|
47
|
+
Requires-Dist: google-generativeai>=0.3.0; extra == 'gemini'
|
|
48
|
+
Provides-Extra: openai
|
|
49
|
+
Requires-Dist: openai>=1.0.0; extra == 'openai'
|
|
43
50
|
Description-Content-Type: text/markdown
|
|
44
51
|
|
|
45
52
|
# mcpbr
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# mcpbr GitHub Action
|
|
2
|
+
|
|
3
|
+
A composite GitHub Action to run MCP server benchmarks in your CI/CD pipeline.
|
|
4
|
+
|
|
5
|
+
## Usage
|
|
6
|
+
|
|
7
|
+
```yaml
|
|
8
|
+
- uses: greynewell/mcpbr/action@main
|
|
9
|
+
with:
|
|
10
|
+
config: benchmarks/config.yaml
|
|
11
|
+
anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Inputs
|
|
15
|
+
|
|
16
|
+
| Input | Description | Required | Default |
|
|
17
|
+
|---|---|---|---|
|
|
18
|
+
| `config` | Path to benchmark configuration YAML | Yes | - |
|
|
19
|
+
| `version` | mcpbr version to install | No | `latest` |
|
|
20
|
+
| `python-version` | Python version to use | No | `3.11` |
|
|
21
|
+
| `output-dir` | Directory for results | No | `results` |
|
|
22
|
+
| `extra-args` | Additional CLI arguments | No | `""` |
|
|
23
|
+
| `anthropic-api-key` | Anthropic API key | No | - |
|
|
24
|
+
| `openai-api-key` | OpenAI API key | No | - |
|
|
25
|
+
|
|
26
|
+
## Outputs
|
|
27
|
+
|
|
28
|
+
| Output | Description |
|
|
29
|
+
|---|---|
|
|
30
|
+
| `results-path` | Path to the results directory |
|
|
31
|
+
| `exit-code` | Exit code from the benchmark run |
|
|
32
|
+
|
|
33
|
+
## Examples
|
|
34
|
+
|
|
35
|
+
See the [examples](examples/) directory for:
|
|
36
|
+
|
|
37
|
+
- [basic.yml](examples/basic.yml) - Simple benchmark run on push
|
|
38
|
+
- [matrix.yml](examples/matrix.yml) - Matrix builds across benchmarks and Python versions
|
|
39
|
+
|
|
40
|
+
## Security
|
|
41
|
+
|
|
42
|
+
Always use GitHub Secrets for API keys. Never hardcode them in workflow files.
|
|
43
|
+
|
|
44
|
+
```yaml
|
|
45
|
+
anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
46
|
+
```
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
name: "mcpbr Benchmark Runner"
|
|
2
|
+
description: "Run MCP server benchmarks in your CI/CD pipeline using mcpbr"
|
|
3
|
+
author: "mcpbr Contributors"
|
|
4
|
+
|
|
5
|
+
branding:
|
|
6
|
+
icon: "bar-chart-2"
|
|
7
|
+
color: "blue"
|
|
8
|
+
|
|
9
|
+
inputs:
|
|
10
|
+
config:
|
|
11
|
+
description: "Path to benchmark configuration YAML file"
|
|
12
|
+
required: true
|
|
13
|
+
version:
|
|
14
|
+
description: "Version of mcpbr to install (e.g., '0.6.0' or 'latest')"
|
|
15
|
+
required: false
|
|
16
|
+
default: "latest"
|
|
17
|
+
python-version:
|
|
18
|
+
description: "Python version to use"
|
|
19
|
+
required: false
|
|
20
|
+
default: "3.11"
|
|
21
|
+
output-dir:
|
|
22
|
+
description: "Directory for benchmark results"
|
|
23
|
+
required: false
|
|
24
|
+
default: "results"
|
|
25
|
+
extra-args:
|
|
26
|
+
description: "Additional arguments to pass to mcpbr run"
|
|
27
|
+
required: false
|
|
28
|
+
default: ""
|
|
29
|
+
anthropic-api-key:
|
|
30
|
+
description: "Anthropic API key (prefer using secrets)"
|
|
31
|
+
required: false
|
|
32
|
+
openai-api-key:
|
|
33
|
+
description: "OpenAI API key (prefer using secrets)"
|
|
34
|
+
required: false
|
|
35
|
+
|
|
36
|
+
outputs:
|
|
37
|
+
results-path:
|
|
38
|
+
description: "Path to the results directory"
|
|
39
|
+
value: ${{ steps.run-benchmark.outputs.results-path }}
|
|
40
|
+
exit-code:
|
|
41
|
+
description: "Exit code from mcpbr run"
|
|
42
|
+
value: ${{ steps.run-benchmark.outputs.exit-code }}
|
|
43
|
+
|
|
44
|
+
runs:
|
|
45
|
+
using: "composite"
|
|
46
|
+
steps:
|
|
47
|
+
- name: Set up Python
|
|
48
|
+
uses: actions/setup-python@v5
|
|
49
|
+
with:
|
|
50
|
+
python-version: ${{ inputs.python-version }}
|
|
51
|
+
|
|
52
|
+
- name: Install mcpbr
|
|
53
|
+
shell: bash
|
|
54
|
+
env:  # inputs reach the script as environment data, never as interpolated shell source
|
|
55
|
+
MCPBR_VERSION: ${{ inputs.version }}
|
|
56
|
+
run: |
|
|
57
|
+
python -m pip install --upgrade pip
|
|
58
|
+
PACKAGE_SPEC="mcpbr"  # unpinned spec installs the newest release
|
|
59
|
+
case "$MCPBR_VERSION" in ""|latest) ;; *) PACKAGE_SPEC="mcpbr==$MCPBR_VERSION" ;; esac
|
|
60
|
+
pip install "$PACKAGE_SPEC"
|
|
61
|
+
|
|
62
|
+
- name: Create output directory
|
|
63
|
+
shell: bash
|
|
64
|
+
env: {MCPBR_OUTPUT_DIR: "${{ inputs.output-dir }}"}  # passed as data, not spliced into the script
|
|
65
|
+
run: mkdir -p "$MCPBR_OUTPUT_DIR"
|
|
66
|
+
- name: Run benchmarks
|
|
67
|
+
id: run-benchmark
|
|
68
|
+
shell: bash
|
|
69
|
+
env:  # every input is handed over via the environment to avoid script injection
|
|
70
|
+
ANTHROPIC_API_KEY: ${{ inputs.anthropic-api-key }}
|
|
71
|
+
OPENAI_API_KEY: ${{ inputs.openai-api-key }}
|
|
72
|
+
MCPBR_CONFIG: ${{ inputs.config }}
|
|
73
|
+
MCPBR_OUTPUT_DIR: ${{ inputs.output-dir }}
|
|
74
|
+
MCPBR_EXTRA_ARGS: ${{ inputs.extra-args }}  # expanded unquoted below so it word-splits into arguments
|
|
75
|
+
run: |
|
|
76
|
+
set +e  # keep going after a failing run so both outputs are still recorded
|
|
77
|
+
mcpbr run --config "$MCPBR_CONFIG" --output-dir "$MCPBR_OUTPUT_DIR" $MCPBR_EXTRA_ARGS
|
|
78
|
+
EXIT_CODE=$?
|
|
79
|
+
set -e
|
|
80
|
+
echo "exit-code=${EXIT_CODE}" >> "$GITHUB_OUTPUT"
|
|
81
|
+
echo "results-path=${MCPBR_OUTPUT_DIR}" >> "$GITHUB_OUTPUT"
|
|
82
|
+
exit "${EXIT_CODE}"  # propagate the benchmark's own status as the step result
|
|
83
|
+
|
|
84
|
+
- name: Upload results artifact
|
|
85
|
+
if: always()  # still upload when the preceding benchmark step exited non-zero
|
|
86
|
+
uses: actions/upload-artifact@v4
|
|
87
|
+
with:
|
|
88
|
+
name: mcpbr-results  # NOTE(review): fixed name; upload-artifact@v4 appears to reject duplicate names within one run, so matrix callers may collide - confirm
|
|
89
|
+
path: ${{ inputs.output-dir }}
|
|
90
|
+
retention-days: 30
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Basic example: Run mcpbr benchmarks on push to main
|
|
2
|
+
name: MCP Benchmark
|
|
3
|
+
|
|
4
|
+
on:
|
|
5
|
+
push:
|
|
6
|
+
branches: [main]
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
benchmark:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
|
|
15
|
+
- name: Run MCP benchmarks
|
|
16
|
+
uses: greynewell/mcpbr/action@main  # action.yml lives in the repository's action/ subdirectory
|
|
17
|
+
with:
|
|
18
|
+
config: benchmarks/config.yaml
|
|
19
|
+
anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
20
|
+
output-dir: results
|
|
21
|
+
|
|
22
|
+
- name: Check results
|
|
23
|
+
run: |
|
|
24
|
+
echo "Benchmark results saved to results/"
|
|
25
|
+
ls -la results/
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Matrix example: Run benchmarks across multiple configurations
|
|
2
|
+
name: MCP Benchmark Matrix
|
|
3
|
+
|
|
4
|
+
on:
|
|
5
|
+
pull_request:
|
|
6
|
+
branches: [main]
|
|
7
|
+
schedule:
|
|
8
|
+
- cron: "0 6 * * 1" # Weekly on Monday at 6am UTC
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
benchmark:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
fail-fast: false
|
|
15
|
+
matrix:
|
|
16
|
+
benchmark:
|
|
17
|
+
- swebench
|
|
18
|
+
- humaneval
|
|
19
|
+
- mbpp
|
|
20
|
+
python-version:
|
|
21
|
+
- "3.11"
|
|
22
|
+
- "3.12"
|
|
23
|
+
|
|
24
|
+
steps:
|
|
25
|
+
- uses: actions/checkout@v4
|
|
26
|
+
|
|
27
|
+
- name: Run ${{ matrix.benchmark }} benchmark
|
|
28
|
+
uses: greynewell/mcpbr/action@main  # action.yml lives in the repository's action/ subdirectory
|
|
29
|
+
with:
|
|
30
|
+
config: benchmarks/${{ matrix.benchmark }}.yaml
|
|
31
|
+
python-version: ${{ matrix.python-version }}
|
|
32
|
+
anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
33
|
+
output-dir: results-${{ matrix.benchmark }}-py${{ matrix.python-version }}
|
|
34
|
+
|
|
35
|
+
- name: Upload individual results
|
|
36
|
+
uses: actions/upload-artifact@v4
|
|
37
|
+
with:
|
|
38
|
+
name: results-${{ matrix.benchmark }}-py${{ matrix.python-version }}
|
|
39
|
+
path: results-${{ matrix.benchmark }}-py${{ matrix.python-version }}
|
|
40
|
+
|
|
41
|
+
aggregate:
|
|
42
|
+
needs: benchmark
|
|
43
|
+
runs-on: ubuntu-latest
|
|
44
|
+
if: always()
|
|
45
|
+
steps:
|
|
46
|
+
- name: Download all results
|
|
47
|
+
uses: actions/download-artifact@v4
|
|
48
|
+
with:
|
|
49
|
+
path: all-results
|
|
50
|
+
|
|
51
|
+
- name: Summary
|
|
52
|
+
run: |
|
|
53
|
+
shopt -s nullglob  # unmatched globs expand to nothing instead of the literal pattern
|
|
54
|
+
echo "## Benchmark Results" >> "$GITHUB_STEP_SUMMARY"
|
|
55
|
+
for dir in all-results/results-*; do
|
|
56
|
+
echo "### $(basename "$dir")" >> "$GITHUB_STEP_SUMMARY"
|
|
57
|
+
for json_file in "$dir"/*.json; do
|
|
58
|
+
echo '```json' >> "$GITHUB_STEP_SUMMARY"  # fence so the JSON renders verbatim in Markdown
|
|
59
|
+
cat "$json_file" >> "$GITHUB_STEP_SUMMARY"
|
|
60
|
+
printf '\n```\n' >> "$GITHUB_STEP_SUMMARY"
|
|
61
|
+
done
|
|
62
|
+
done
|