mcpbr 0.4.16__tar.gz → 0.6.0__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (279)
  1. {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/marketplace.json +2 -2
  2. {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/package.json +1 -1
  3. {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/plugin.json +1 -1
  4. mcpbr-0.6.0/.dockerignore +55 -0
  5. mcpbr-0.6.0/.gitattributes +2 -0
  6. mcpbr-0.6.0/.github/workflows/build-binaries.yml +83 -0
  7. {mcpbr-0.4.16 → mcpbr-0.6.0}/.gitignore +1 -0
  8. {mcpbr-0.4.16 → mcpbr-0.6.0}/.pre-commit-config.yaml +1 -0
  9. {mcpbr-0.4.16 → mcpbr-0.6.0}/CHANGELOG.md +55 -0
  10. mcpbr-0.6.0/Dockerfile.app +61 -0
  11. mcpbr-0.6.0/Formula/mcpbr.rb +51 -0
  12. {mcpbr-0.4.16 → mcpbr-0.6.0}/PKG-INFO +8 -1
  13. mcpbr-0.6.0/action/README.md +46 -0
  14. mcpbr-0.6.0/action/action.yml +90 -0
  15. mcpbr-0.6.0/action/examples/basic.yml +25 -0
  16. mcpbr-0.6.0/action/examples/matrix.yml +62 -0
  17. mcpbr-0.6.0/ci-templates/circleci/README.md +56 -0
  18. mcpbr-0.6.0/ci-templates/circleci/orb.yml +134 -0
  19. mcpbr-0.6.0/ci-templates/gitlab/.gitlab-ci-mcpbr.yml +69 -0
  20. mcpbr-0.6.0/ci-templates/gitlab/README.md +44 -0
  21. mcpbr-0.6.0/conda/README.md +54 -0
  22. mcpbr-0.6.0/conda/bld.bat +3 -0
  23. mcpbr-0.6.0/conda/build.sh +5 -0
  24. mcpbr-0.6.0/conda/meta.yaml +59 -0
  25. mcpbr-0.6.0/docker/README.md +59 -0
  26. mcpbr-0.6.0/docker/docker-compose.yml +29 -0
  27. mcpbr-0.6.0/docker/entrypoint.sh +16 -0
  28. mcpbr-0.6.0/homebrew/README.md +39 -0
  29. mcpbr-0.6.0/mcpbr.spec +83 -0
  30. {mcpbr-0.4.16 → mcpbr-0.6.0}/package.json +1 -1
  31. {mcpbr-0.4.16 → mcpbr-0.6.0}/pyproject.toml +4 -1
  32. mcpbr-0.6.0/src/mcpbr/__init__.py +25 -0
  33. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/config.py +37 -1
  34. mcpbr-0.6.0/src/mcpbr/config_migration.py +470 -0
  35. mcpbr-0.6.0/src/mcpbr/config_wizard.py +647 -0
  36. mcpbr-0.6.0/src/mcpbr/dashboard.py +619 -0
  37. mcpbr-0.6.0/src/mcpbr/dataset_streaming.py +491 -0
  38. mcpbr-0.6.0/src/mcpbr/docker_cache.py +539 -0
  39. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/docker_env.py +2 -1
  40. mcpbr-0.6.0/src/mcpbr/docker_prewarm.py +370 -0
  41. mcpbr-0.6.0/src/mcpbr/dry_run.py +533 -0
  42. mcpbr-0.6.0/src/mcpbr/formatting.py +444 -0
  43. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/gpu_support.py +2 -1
  44. mcpbr-0.6.0/src/mcpbr/graceful_degradation.py +277 -0
  45. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/harness.py +38 -4
  46. mcpbr-0.6.0/src/mcpbr/languages.py +228 -0
  47. mcpbr-0.6.0/src/mcpbr/logging_config.py +207 -0
  48. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/models.py +66 -0
  49. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/preflight.py +2 -1
  50. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/pricing.py +72 -0
  51. mcpbr-0.6.0/src/mcpbr/providers.py +549 -0
  52. mcpbr-0.6.0/src/mcpbr/resource_limits.py +487 -0
  53. mcpbr-0.6.0/src/mcpbr/result_streaming.py +519 -0
  54. mcpbr-0.6.0/src/mcpbr/sdk.py +264 -0
  55. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/smoke_test.py +2 -1
  56. mcpbr-0.6.0/src/mcpbr/task_batching.py +403 -0
  57. mcpbr-0.6.0/src/mcpbr/task_scheduler.py +468 -0
  58. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_adversarial_benchmark.py +4 -10
  59. mcpbr-0.6.0/tests/test_config_migration.py +971 -0
  60. mcpbr-0.6.0/tests/test_config_wizard.py +1142 -0
  61. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_custom_benchmark.py +11 -31
  62. mcpbr-0.6.0/tests/test_dashboard.py +748 -0
  63. mcpbr-0.6.0/tests/test_dataset_streaming.py +837 -0
  64. mcpbr-0.6.0/tests/test_docker_cache.py +898 -0
  65. mcpbr-0.6.0/tests/test_docker_prewarm.py +672 -0
  66. mcpbr-0.6.0/tests/test_dry_run.py +833 -0
  67. mcpbr-0.6.0/tests/test_formatting.py +703 -0
  68. mcpbr-0.6.0/tests/test_graceful_degradation.py +621 -0
  69. mcpbr-0.6.0/tests/test_logging_config.py +595 -0
  70. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_models.py +2 -2
  71. mcpbr-0.6.0/tests/test_multi_language.py +454 -0
  72. mcpbr-0.6.0/tests/test_multi_provider.py +625 -0
  73. mcpbr-0.6.0/tests/test_platform_files.py +706 -0
  74. mcpbr-0.6.0/tests/test_resource_limits.py +859 -0
  75. mcpbr-0.6.0/tests/test_result_streaming.py +834 -0
  76. mcpbr-0.6.0/tests/test_sdk.py +515 -0
  77. mcpbr-0.6.0/tests/test_task_batching.py +819 -0
  78. mcpbr-0.6.0/tests/test_task_scheduler.py +643 -0
  79. {mcpbr-0.4.16 → mcpbr-0.6.0}/uv.lock +305 -5
  80. mcpbr-0.4.16/src/mcpbr/__init__.py +0 -6
  81. mcpbr-0.4.16/src/mcpbr/providers.py +0 -236
  82. {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude/settings.json +0 -0
  83. {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/README.md +0 -0
  84. {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/skills/README.md +0 -0
  85. {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/skills/benchmark-swe-lite/SKILL.md +0 -0
  86. {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/skills/mcpbr-config/SKILL.md +0 -0
  87. {mcpbr-0.4.16 → mcpbr-0.6.0}/.claude-plugin/skills/mcpbr-eval/SKILL.md +0 -0
  88. {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  89. {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  90. {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  91. {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  92. {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/dependabot.yml +0 -0
  93. {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/release-drafter.yml +0 -0
  94. {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/workflows/ci.yml +0 -0
  95. {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/workflows/post-release-bump.yml +0 -0
  96. {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/workflows/publish-npm.yml +0 -0
  97. {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/workflows/publish.yml +0 -0
  98. {mcpbr-0.4.16 → mcpbr-0.6.0}/.github/workflows/release-drafter.yml +0 -0
  99. {mcpbr-0.4.16 → mcpbr-0.6.0}/AGENTS.md +0 -0
  100. {mcpbr-0.4.16 → mcpbr-0.6.0}/CLAUDE.md +0 -0
  101. {mcpbr-0.4.16 → mcpbr-0.6.0}/CODE_OF_CONDUCT.md +0 -0
  102. {mcpbr-0.4.16 → mcpbr-0.6.0}/CONTRIBUTING.md +0 -0
  103. {mcpbr-0.4.16 → mcpbr-0.6.0}/Dockerfile +0 -0
  104. {mcpbr-0.4.16 → mcpbr-0.6.0}/HUMANEVAL_FIX_SUMMARY.md +0 -0
  105. {mcpbr-0.4.16 → mcpbr-0.6.0}/LICENSE +0 -0
  106. {mcpbr-0.4.16 → mcpbr-0.6.0}/Makefile +0 -0
  107. {mcpbr-0.4.16 → mcpbr-0.6.0}/PR_SUMMARY.md +0 -0
  108. {mcpbr-0.4.16 → mcpbr-0.6.0}/README.md +0 -0
  109. {mcpbr-0.4.16 → mcpbr-0.6.0}/SECURITY.md +0 -0
  110. {mcpbr-0.4.16 → mcpbr-0.6.0}/assets/mcpbr-demo.gif +0 -0
  111. {mcpbr-0.4.16 → mcpbr-0.6.0}/assets/mcpbr-eval-results.png +0 -0
  112. {mcpbr-0.4.16 → mcpbr-0.6.0}/assets/mcpbr-logo.jpg +0 -0
  113. {mcpbr-0.4.16 → mcpbr-0.6.0}/bin/mcpbr.js +0 -0
  114. {mcpbr-0.4.16 → mcpbr-0.6.0}/config/example.yaml +0 -0
  115. {mcpbr-0.4.16 → mcpbr-0.6.0}/config/humaneval.yaml +0 -0
  116. {mcpbr-0.4.16 → mcpbr-0.6.0}/config/supermodel.yaml +0 -0
  117. {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/azure-config-example.yaml +0 -0
  118. {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/custom-benchmark.yaml +0 -0
  119. {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/env-vars-example.yaml +0 -0
  120. {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/inheritance/README.md +0 -0
  121. {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/inheritance/base-config.yaml +0 -0
  122. {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/inheritance/dev-config.yaml +0 -0
  123. {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/inheritance/multi-extend-config.yaml +0 -0
  124. {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/inheritance/production-config.yaml +0 -0
  125. {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/inheritance/shared-mcp-settings.yaml +0 -0
  126. {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/local-config-example.yaml +0 -0
  127. {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/quick-start/gsm8k-math-reasoning.yaml +0 -0
  128. {mcpbr-0.4.16 → mcpbr-0.6.0}/examples/quick-start/test-your-mcp-server.yaml +0 -0
  129. {mcpbr-0.4.16 → mcpbr-0.6.0}/install.sh +0 -0
  130. {mcpbr-0.4.16 → mcpbr-0.6.0}/requirements.txt +0 -0
  131. {mcpbr-0.4.16 → mcpbr-0.6.0}/scripts/sync_version.py +0 -0
  132. {mcpbr-0.4.16 → mcpbr-0.6.0}/scripts/validate_plugin_manifests.py +0 -0
  133. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/__main__.py +0 -0
  134. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/agent.py +0 -0
  135. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/__init__.py +0 -0
  136. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/adversarial.py +0 -0
  137. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/agentbench.py +0 -0
  138. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/aider_polyglot.py +0 -0
  139. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/apps.py +0 -0
  140. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/arc.py +0 -0
  141. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/base.py +0 -0
  142. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/bigbench_hard.py +0 -0
  143. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/bigcodebench.py +0 -0
  144. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/codecontests.py +0 -0
  145. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/codereval.py +0 -0
  146. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/custom.py +0 -0
  147. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/cybergym.py +0 -0
  148. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/gaia.py +0 -0
  149. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/gsm8k.py +0 -0
  150. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/hellaswag.py +0 -0
  151. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/humaneval.py +0 -0
  152. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/intercode.py +0 -0
  153. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/leetcode.py +0 -0
  154. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/longbench.py +0 -0
  155. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/math_benchmark.py +0 -0
  156. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/mbpp.py +0 -0
  157. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/mcptoolbench.py +0 -0
  158. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/mlagentbench.py +0 -0
  159. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/mmmu.py +0 -0
  160. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/repoqa.py +0 -0
  161. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/swebench.py +0 -0
  162. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/terminalbench.py +0 -0
  163. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/toolbench.py +0 -0
  164. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/truthfulqa.py +0 -0
  165. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/benchmarks/webarena.py +0 -0
  166. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/cache.py +0 -0
  167. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/cli.py +0 -0
  168. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/config_inheritance.py +0 -0
  169. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/config_validator.py +0 -0
  170. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/custom_metrics.py +0 -0
  171. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/data/templates/brave-search.yaml +0 -0
  172. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/data/templates/filesystem.yaml +0 -0
  173. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/data/templates/github.yaml +0 -0
  174. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/data/templates/google-maps.yaml +0 -0
  175. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/data/templates/postgres.yaml +0 -0
  176. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/data/templates/slack.yaml +0 -0
  177. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/data/templates/sqlite.yaml +0 -0
  178. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/dataset_versioning.py +0 -0
  179. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/env_expansion.py +0 -0
  180. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/evaluation.py +0 -0
  181. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/failure_analysis.py +0 -0
  182. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/few_shot.py +0 -0
  183. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/harnesses.py +0 -0
  184. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/incremental_save.py +0 -0
  185. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/infrastructure/__init__.py +0 -0
  186. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/infrastructure/azure.py +0 -0
  187. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/infrastructure/azure_health.py +0 -0
  188. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/infrastructure/base.py +0 -0
  189. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/infrastructure/local.py +0 -0
  190. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/infrastructure/manager.py +0 -0
  191. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/junit_reporter.py +0 -0
  192. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/latency_metrics.py +0 -0
  193. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/log_formatter.py +0 -0
  194. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/output_validator.py +0 -0
  195. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/profiler.py +0 -0
  196. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/regression.py +0 -0
  197. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/reporting.py +0 -0
  198. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/sampling.py +0 -0
  199. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/schema.py +0 -0
  200. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/state_tracker.py +0 -0
  201. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/statistics.py +0 -0
  202. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/streaming.py +0 -0
  203. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/swebench_test_specs.py +0 -0
  204. {mcpbr-0.4.16 → mcpbr-0.6.0}/src/mcpbr/templates.py +0 -0
  205. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/__init__.py +0 -0
  206. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/__init__.py +0 -0
  207. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/test_azure.py +0 -0
  208. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/test_azure_health.py +0 -0
  209. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/test_base.py +0 -0
  210. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/test_cli_infrastructure.py +0 -0
  211. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/test_config.py +0 -0
  212. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/test_local.py +0 -0
  213. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/infrastructure/test_manager.py +0 -0
  214. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_agent.py +0 -0
  215. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_benchmark_filtering.py +0 -0
  216. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_benchmark_integration.py +0 -0
  217. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_benchmarks.py +0 -0
  218. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_build_test_command.py +0 -0
  219. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_cache.py +0 -0
  220. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_claude_plugin.py +0 -0
  221. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_cli_templates.py +0 -0
  222. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_comparison_aggregation.py +0 -0
  223. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_comparison_config.py +0 -0
  224. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_comparison_integration.py +0 -0
  225. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_comparison_reporting.py +0 -0
  226. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_config.py +0 -0
  227. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_config_env_vars.py +0 -0
  228. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_config_inheritance.py +0 -0
  229. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_config_validator.py +0 -0
  230. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_config_validator_inheritance.py +0 -0
  231. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_cost_calculation.py +0 -0
  232. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_custom_metrics.py +0 -0
  233. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_dataset_versioning.py +0 -0
  234. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_default_logging.py +0 -0
  235. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_docker_cleanup.py +0 -0
  236. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_docker_label_fix.py +0 -0
  237. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_docker_retry.py +0 -0
  238. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_env_expansion.py +0 -0
  239. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_error_messages.py +0 -0
  240. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_evaluation.py +0 -0
  241. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_exit_codes.py +0 -0
  242. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_export.py +0 -0
  243. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_failure_analysis.py +0 -0
  244. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_few_shot.py +0 -0
  245. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_git_diff_new_files.py +0 -0
  246. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_gpu_support.py +0 -0
  247. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_incremental_save.py +0 -0
  248. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_integration.py +0 -0
  249. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_junit_reporter.py +0 -0
  250. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_latency_metrics.py +0 -0
  251. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_log_formatter_read_tool.py +0 -0
  252. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_longbench_benchmark.py +0 -0
  253. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_mcp_health_check.py +0 -0
  254. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_mcp_logging.py +0 -0
  255. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_mcptoolbench_benchmark.py +0 -0
  256. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_mmmu_benchmark.py +0 -0
  257. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_output_validator.py +0 -0
  258. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_parse_errors.py +0 -0
  259. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_preflight.py +0 -0
  260. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_pricing.py +0 -0
  261. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_profiler.py +0 -0
  262. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_regression.py +0 -0
  263. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_reporting.py +0 -0
  264. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_runtime_tracking.py +0 -0
  265. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_sampling.py +0 -0
  266. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_schema.py +0 -0
  267. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_smoke_test.py +0 -0
  268. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_state_tracker.py +0 -0
  269. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_statistics.py +0 -0
  270. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_statistics_integration.py +0 -0
  271. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_streaming.py +0 -0
  272. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_string_concat_bug.py +0 -0
  273. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_templates.py +0 -0
  274. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_thinking_budget.py +0 -0
  275. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_timeout_tracking.py +0 -0
  276. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_tool_failure_tracking.py +0 -0
  277. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_trial_mode.py +0 -0
  278. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_type_safety.py +0 -0
  279. {mcpbr-0.4.16 → mcpbr-0.6.0}/tests/test_xml_export.py +0 -0
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "$schema": "https://anthropic.com/claude-code/marketplace.schema.json",
3
3
  "name": "mcpbr",
4
- "version": "0.4.16",
4
+ "version": "0.6.0",
5
5
  "description": "mcpbr - MCP Benchmark Runner plugin marketplace",
6
6
  "owner": {
7
7
  "name": "mcpbr Contributors",
@@ -11,7 +11,7 @@
11
11
  {
12
12
  "name": "mcpbr",
13
13
  "description": "Expert benchmark runner for MCP servers using mcpbr. Handles Docker checks, config generation, and result parsing.",
14
- "version": "0.4.16",
14
+ "version": "0.6.0",
15
15
  "author": {
16
16
  "name": "mcpbr Contributors"
17
17
  },
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@greynewell/mcpbr-claude-plugin",
3
- "version": "0.4.16",
3
+ "version": "0.6.0",
4
4
  "description": "Claude Code plugin for mcpbr - Expert benchmark runner for MCP servers with specialized skills",
5
5
  "keywords": [
6
6
  "claude-code",
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mcpbr",
3
- "version": "0.4.16",
3
+ "version": "0.6.0",
4
4
  "description": "Expert benchmark runner for MCP servers using mcpbr. Handles Docker checks, config generation, and result parsing.",
5
5
  "schema_version": "1.0"
6
6
  }
@@ -0,0 +1,55 @@
1
+ # Git
2
+ .git
3
+ .gitignore
4
+
5
+ # Python
6
+ __pycache__
7
+ *.pyc
8
+ *.pyo
9
+ *.egg-info
10
+ dist/
11
+ build/
12
+ .eggs/
13
+ *.egg
14
+
15
+ # Virtual environments
16
+ .venv
17
+ venv
18
+ env
19
+
20
+ # IDE
21
+ .vscode
22
+ .idea
23
+ *.swp
24
+ *.swo
25
+
26
+ # Testing
27
+ .pytest_cache
28
+ .coverage
29
+ htmlcov/
30
+ .tox
31
+
32
+ # Documentation
33
+ docs/
34
+ site/
35
+ mkdocs.yml
36
+
37
+ # CI/CD
38
+ .github/
39
+ ci-templates/
40
+
41
+ # Docker (avoid recursive copies)
42
+ docker/results/
43
+
44
+ # Node
45
+ node_modules/
46
+ npm-debug.log
47
+
48
+ # OS
49
+ .DS_Store
50
+ Thumbs.db
51
+
52
+ # Secrets
53
+ .env
54
+ .env.*
55
+ credentials.json
@@ -0,0 +1,2 @@
1
+ # Ensure Windows batch files use CRLF line endings
2
+ conda/bld.bat text eol=crlf
@@ -0,0 +1,83 @@
1
+ name: Build Binaries
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ workflow_dispatch:
7
+ inputs:
8
+ tag:
9
+ description: "Release tag to build (e.g., v0.6.0)"
10
+ required: false
11
+ type: string
12
+
13
+ permissions:
14
+ contents: write
15
+
16
+ jobs:
17
+ build:
18
+ runs-on: ${{ matrix.os }}
19
+ strategy:
20
+ fail-fast: false
21
+ matrix:
22
+ include:
23
+ - os: ubuntu-latest
24
+ platform: linux
25
+ arch: x86_64
26
+ artifact-name: mcpbr-linux-x86_64
27
+ - os: macos-latest
28
+ platform: macos
29
+ arch: arm64
30
+ artifact-name: mcpbr-macos-arm64
31
+ - os: macos-13-large
32
+ platform: macos
33
+ arch: x86_64
34
+ artifact-name: mcpbr-macos-x86_64
35
+ - os: windows-latest
36
+ platform: windows
37
+ arch: x86_64
38
+ artifact-name: mcpbr-windows-x86_64
39
+
40
+ steps:
41
+ - uses: actions/checkout@v4
42
+ with:
43
+ ref: ${{ github.event.inputs.tag || github.ref }}
44
+
45
+ - name: Set up Python
46
+ uses: actions/setup-python@v5
47
+ with:
48
+ python-version: "3.11"
49
+
50
+ - name: Install dependencies
51
+ run: |
52
+ python -m pip install --upgrade pip
53
+ pip install pyinstaller
54
+ pip install -e "."
55
+
56
+ - name: Build binary (Unix)
57
+ if: matrix.platform != 'windows'
58
+ run: |
59
+ pyinstaller mcpbr.spec
60
+ cd dist
61
+ tar -czf ${{ matrix.artifact-name }}.tar.gz mcpbr
62
+ cd ..
63
+
64
+ - name: Build binary (Windows)
65
+ if: matrix.platform == 'windows'
66
+ run: |
67
+ pyinstaller mcpbr.spec
68
+ cd dist
69
+ Compress-Archive -Path mcpbr.exe -DestinationPath ${{ matrix.artifact-name }}.zip
70
+ cd ..
71
+ shell: pwsh
72
+
73
+ - name: Upload artifact
74
+ uses: actions/upload-artifact@v4
75
+ with:
76
+ name: ${{ matrix.artifact-name }}
77
+ path: dist/mcpbr-*.*
78
+
79
+ - name: Upload to release
80
+ if: github.event_name == 'release'
81
+ uses: softprops/action-gh-release@v2
82
+ with:
83
+ files: dist/mcpbr-*.*
@@ -27,6 +27,7 @@ wheels/
27
27
  # PyInstaller
28
28
  *.manifest
29
29
  *.spec
30
+ !mcpbr.spec
30
31
 
31
32
  # Installer logs
32
33
  pip-log.txt
@@ -22,5 +22,6 @@ repos:
22
22
  - id: trailing-whitespace
23
23
  - id: end-of-file-fixer
24
24
  - id: check-yaml
25
+ exclude: ^conda/meta\.yaml$
25
26
  - id: check-added-large-files
26
27
  - id: check-merge-conflict
@@ -7,6 +7,59 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.6.0] - 2026-02-05
11
+
12
+ ### Added
13
+
14
+ - **Graceful degradation** (#70): Fault-tolerant task execution with failure isolation, classification (transient/permanent/unknown), configurable `continue_on_error` and `max_failures` policies, execution checkpointing for crash recovery, and partial report generation
15
+ - New config fields: `continue_on_error`, `max_failures`, `checkpoint_interval`, `resume_from_checkpoint`
16
+ - **Multi-provider support** (#229): Added OpenAI, Google Gemini, and Alibaba Qwen as model providers alongside Anthropic
17
+ - `OpenAIProvider` for GPT-4o, GPT-4 Turbo, and GPT-4o Mini models
18
+ - `GeminiProvider` for Gemini 2.0 Flash, Gemini 1.5 Pro, and Gemini 1.5 Flash models
19
+ - `QwenProvider` for Qwen Plus, Qwen Turbo, and Qwen Max models via DashScope API
20
+ - New optional dependencies: `openai`, `gemini`, `all-providers` extras
21
+ - Pricing data for all 9 new models
22
+ - Model registry entries with context window and tool support metadata
23
+ - **Multi-language support** (#230): Cross-language benchmark execution for Python, JavaScript, TypeScript, Java, and Go
24
+ - Per-language Docker images, run/compile commands, and test framework configs
25
+ - Automatic language detection from filenames and code patterns
26
+ - Cross-language metrics aggregation
27
+ - **Structured logging** (#231): JSON and human-readable log formatters with contextual metadata
28
+ - File rotation, configurable log levels via `MCPBR_LOG_LEVEL` env var
29
+ - `LogContext` context manager for injecting task/benchmark fields into log records
30
+ - **Public Python SDK** (#232): Programmatic API for configuring and running benchmarks
31
+ - `MCPBenchmark` class with config from dict, YAML, or `HarnessConfig`
32
+ - `list_benchmarks()`, `list_providers()`, `list_models()`, `get_version()` helpers
33
+ - Exported in top-level `mcpbr` package for `from mcpbr import MCPBenchmark`
34
+ - **Platform distribution files**: Docker, Conda, Homebrew, GitHub Actions, and CI templates
35
+ - `Dockerfile.app` multi-stage build for container deployment
36
+ - `docker/docker-compose.yml` for multi-container orchestration
37
+ - `conda/meta.yaml` recipe for Conda packaging
38
+ - `action/action.yml` GitHub Action with basic and matrix examples
39
+ - `ci-templates/` for GitLab CI and CircleCI integration
40
+
41
+ ## [0.5.0] - 2026-02-05
42
+
43
+ ### Added
44
+
45
+ - **Real-time web dashboard** (#58): Live monitoring of benchmark evaluations via `DashboardServer` with FastAPI + WebSocket, task progress, resolution rate, ETA, and pause/resume/cancel controls
46
+ - **Interactive configuration wizard** (#74): Step-by-step CLI wizard for creating config files with presets (filesystem, web-search, database), model/benchmark selection, and MCP server setup
47
+ - **Dry-run mode** (#84): Preview evaluation plan without executing — shows tasks, estimated cost/time, validates config, checks Docker and MCP server availability
48
+ - **Task prioritization and scheduling** (#92): Intelligent task ordering with speed-first, cost-first, coverage-first, and custom scoring strategies
49
+ - **Color and formatting options** (#105): Configurable output themes (default, minimal, plain) with NO_COLOR convention support and MCPBR_THEME env var
50
+ - **Docker image pre-warming** (#128): Pre-pull Docker images in parallel before evaluation starts with progress reporting and cache detection
51
+ - **Result streaming to external storage** (#131): Stream results as tasks complete to local JSONL files, S3-compatible storage, or webhooks with buffering and retry
52
+ - **Memory-efficient large dataset handling** (#134): Streaming and chunked loading of large HuggingFace datasets with memory monitoring and automatic chunk-size adaptation
53
+ - **Task batching with smart scheduling** (#137): Group similar tasks by repo/image/category to minimize Docker container restarts with adaptive batch sizing
54
+ - **Resource limit configuration** (#139): Configure CPU, memory, disk, PID, and network limits for Docker containers with monitoring and violation reporting
55
+ - **Configuration migration tool** (#195): Detect and migrate old config formats (V1→V4) with dry-run preview, backup, and chained migration steps
56
+ - **Docker image caching optimization** (#228): LRU cache management with size limits, usage tracking, eviction, warmup recommendations, and dangling image cleanup
57
+
58
+ ### Fixed
59
+
60
+ - **Zero-cost metrics on evaluation timeout** (#374): Agent metrics (cost, tokens, iterations) were discarded when `benchmark.evaluate()` timed out after the agent had already completed successfully. Now preserves agent results when available.
61
+ - **Process hang after evaluation completes** (#374): `asyncio.run()` blocked indefinitely during cleanup because Docker SDK urllib3 background threads kept the default executor alive. Now force-shuts down the executor with `wait=False`.
62
+
10
63
  ## [0.4.16] - 2026-02-05
11
64
 
12
65
  ### Added
@@ -729,6 +782,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
729
782
  [0.3.14]: https://github.com/greynewell/mcpbr/releases/tag/v0.3.14
730
783
  [0.3.13]: https://github.com/greynewell/mcpbr/releases/tag/v0.3.13
731
784
  [0.3.12]: https://github.com/greynewell/mcpbr/releases/tag/v0.3.12
785
+ [0.6.0]: https://github.com/greynewell/mcpbr/releases/tag/v0.6.0
786
+ [0.5.0]: https://github.com/greynewell/mcpbr/releases/tag/v0.5.0
732
787
  [0.4.16]: https://github.com/greynewell/mcpbr/releases/tag/v0.4.16
733
788
  [0.3.11]: https://github.com/greynewell/mcpbr/releases/tag/v0.3.11
734
789
  [0.3.10]: https://github.com/greynewell/mcpbr/releases/tag/v0.3.10
@@ -0,0 +1,61 @@
1
+ # Multi-stage Dockerfile for running mcpbr CLI
2
+ # This is NOT the task environment Dockerfile - see Dockerfile for that.
3
+ # This image packages the mcpbr benchmark runner itself.
4
+
5
+ # Stage 1: Build
6
+ FROM python:3.11-slim AS builder
7
+
8
+ WORKDIR /build
9
+
10
+ # Install build dependencies
11
+ RUN pip install --no-cache-dir build hatchling
12
+
13
+ # Copy project files
14
+ COPY pyproject.toml README.md LICENSE ./
15
+ COPY src/ src/
16
+
17
+ # Build the wheel
18
+ RUN python -m build --wheel --outdir /build/dist
19
+
20
+ # Stage 2: Runtime
21
+ FROM python:3.11-slim
22
+
23
+ LABEL maintainer="mcpbr Contributors"
24
+ LABEL org.opencontainers.image.source="https://github.com/greynewell/mcpbr"
25
+ LABEL org.opencontainers.image.description="MCP Benchmark Runner - evaluate MCP servers against software engineering benchmarks"
26
+ LABEL org.opencontainers.image.licenses="MIT"
27
+
28
+ # Install runtime system dependencies
29
+ RUN apt-get update && apt-get install -y --no-install-recommends \
30
+ git \
31
+ curl \
32
+ ca-certificates \
33
+ && rm -rf /var/lib/apt/lists/*
34
+
35
+ # Create non-root user
36
+ RUN groupadd --gid 1000 mcpbr && \
37
+ useradd --uid 1000 --gid mcpbr --shell /bin/bash --create-home mcpbr
38
+
39
+ WORKDIR /home/mcpbr
40
+
41
+ # Install the built wheel from the builder stage
42
+ COPY --from=builder /build/dist/*.whl /tmp/
43
+ RUN pip install --no-cache-dir /tmp/*.whl && \
44
+ rm -f /tmp/*.whl
45
+
46
+ # Copy entrypoint
47
+ COPY docker/entrypoint.sh /usr/local/bin/entrypoint.sh
48
+ RUN chmod +x /usr/local/bin/entrypoint.sh
49
+
50
+ # Create directories for configs and results
51
+ RUN mkdir -p /home/mcpbr/configs /home/mcpbr/results && \
52
+ chown -R mcpbr:mcpbr /home/mcpbr
53
+
54
+ # Switch to non-root user
55
+ USER mcpbr
56
+
57
+ # Mount points for user configs and results
58
+ VOLUME ["/home/mcpbr/configs", "/home/mcpbr/results"]
59
+
60
+ ENTRYPOINT ["entrypoint.sh"]
61
+ CMD ["--help"]
@@ -0,0 +1,51 @@
1
+ class Mcpbr < Formula
2
+ include Language::Python::Virtualenv
3
+
4
+ desc "Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks"
5
+ homepage "https://github.com/greynewell/mcpbr"
6
+ # NOTE: Update URL and sha256 when publishing a release.
7
+ # Run: curl -sL <url> | shasum -a 256
8
+ url "https://files.pythonhosted.org/packages/source/m/mcpbr/mcpbr-0.6.0.tar.gz"
9
+ sha256 "PLACEHOLDER_SHA256_REPLACE_ON_RELEASE"
10
+ license "MIT"
11
+
12
+ depends_on "python@3.11"
13
+
14
+ resource "anthropic" do
15
+ url "https://files.pythonhosted.org/packages/source/a/anthropic/anthropic-0.40.0.tar.gz"
16
+ sha256 "PLACEHOLDER_SHA256"
17
+ end
18
+
19
+ resource "click" do
20
+ url "https://files.pythonhosted.org/packages/source/c/click/click-8.1.7.tar.gz"
21
+ sha256 "PLACEHOLDER_SHA256"
22
+ end
23
+
24
+ resource "docker" do
25
+ url "https://files.pythonhosted.org/packages/source/d/docker/docker-7.0.0.tar.gz"
26
+ sha256 "PLACEHOLDER_SHA256"
27
+ end
28
+
29
+ resource "pydantic" do
30
+ url "https://files.pythonhosted.org/packages/source/p/pydantic/pydantic-2.0.0.tar.gz"
31
+ sha256 "PLACEHOLDER_SHA256"
32
+ end
33
+
34
+ resource "pyyaml" do
35
+ url "https://files.pythonhosted.org/packages/source/p/PyYAML/PyYAML-6.0.1.tar.gz"
36
+ sha256 "PLACEHOLDER_SHA256"
37
+ end
38
+
39
+ resource "rich" do
40
+ url "https://files.pythonhosted.org/packages/source/r/rich/rich-13.0.0.tar.gz"
41
+ sha256 "PLACEHOLDER_SHA256"
42
+ end
43
+
44
+ def install
45
+ virtualenv_install_with_resources
46
+ end
47
+
48
+ test do
49
+ assert_match "mcpbr", shell_output("#{bin}/mcpbr --version")
50
+ end
51
+ end
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mcpbr
3
- Version: 0.4.16
3
+ Version: 0.6.0
4
4
  Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
5
5
  Project-URL: Homepage, https://github.com/greynewell/mcpbr
6
6
  Project-URL: Repository, https://github.com/greynewell/mcpbr
@@ -30,6 +30,9 @@ Requires-Dist: pydantic>=2.0.0
30
30
  Requires-Dist: pyyaml>=6.0.0
31
31
  Requires-Dist: requests>=2.31.0
32
32
  Requires-Dist: rich>=13.0.0
33
+ Provides-Extra: all-providers
34
+ Requires-Dist: google-generativeai>=0.3.0; extra == 'all-providers'
35
+ Requires-Dist: openai>=1.0.0; extra == 'all-providers'
33
36
  Provides-Extra: dev
34
37
  Requires-Dist: pre-commit>=3.0.0; extra == 'dev'
35
38
  Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
@@ -40,6 +43,10 @@ Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
40
43
  Requires-Dist: mkdocs-minify-plugin>=0.7.0; extra == 'docs'
41
44
  Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
42
45
  Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
46
+ Provides-Extra: gemini
47
+ Requires-Dist: google-generativeai>=0.3.0; extra == 'gemini'
48
+ Provides-Extra: openai
49
+ Requires-Dist: openai>=1.0.0; extra == 'openai'
43
50
  Description-Content-Type: text/markdown
44
51
 
45
52
  # mcpbr
@@ -0,0 +1,46 @@
1
+ # mcpbr GitHub Action
2
+
3
+ A composite GitHub Action to run MCP server benchmarks in your CI/CD pipeline.
4
+
5
+ ## Usage
6
+
7
+ ```yaml
8
+ - uses: greynewell/mcpbr@main
9
+ with:
10
+ config: benchmarks/config.yaml
11
+ anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }}
12
+ ```
13
+
14
+ ## Inputs
15
+
16
+ | Input | Description | Required | Default |
17
+ |---|---|---|---|
18
+ | `config` | Path to benchmark configuration YAML | Yes | - |
19
+ | `version` | mcpbr version to install | No | `latest` |
20
+ | `python-version` | Python version to use | No | `3.11` |
21
+ | `output-dir` | Directory for results | No | `results` |
22
+ | `extra-args` | Additional CLI arguments | No | `""` |
23
+ | `anthropic-api-key` | Anthropic API key | No | - |
24
+ | `openai-api-key` | OpenAI API key | No | - |
25
+
26
+ ## Outputs
27
+
28
+ | Output | Description |
29
+ |---|---|
30
+ | `results-path` | Path to the results directory |
31
+ | `exit-code` | Exit code from the benchmark run |
32
+
33
+ ## Examples
34
+
35
+ See the [examples](examples/) directory for:
36
+
37
+ - [basic.yml](examples/basic.yml) - Simple benchmark run on push
38
+ - [matrix.yml](examples/matrix.yml) - Matrix builds across benchmarks and Python versions
39
+
40
+ ## Security
41
+
42
+ Always use GitHub Secrets for API keys. Never hardcode them in workflow files.
43
+
44
+ ```yaml
45
+ anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }}
46
+ ```
@@ -0,0 +1,109 @@
1
+ name: "mcpbr Benchmark Runner"
2
+ description: "Run MCP server benchmarks in your CI/CD pipeline using mcpbr"
3
+ author: "mcpbr Contributors"
4
+
5
+ branding:
6
+   icon: "bar-chart-2"
7
+   color: "blue"
8
+
9
+ inputs:
10
+   config:
11
+     description: "Path to benchmark configuration YAML file"
12
+     required: true
13
+   version:
14
+     description: "Version of mcpbr to install (e.g., '0.6.0' or 'latest')"
15
+     required: false
16
+     default: "latest"
17
+   python-version:
18
+     description: "Python version to use"
19
+     required: false
20
+     default: "3.11"
21
+   output-dir:
22
+     description: "Directory for benchmark results"
23
+     required: false
24
+     default: "results"
25
+   extra-args:
26
+     description: "Additional arguments to pass to mcpbr run (split on whitespace)"
27
+     required: false
28
+     default: ""
29
+   artifact-name:
30
+     description: "Name of the uploaded results artifact (must be unique within a workflow run, e.g. per matrix job)"
31
+     required: false
32
+     default: "mcpbr-results"
33
+   anthropic-api-key:
34
+     description: "Anthropic API key (prefer using secrets)"
35
+     required: false
36
+   openai-api-key:
37
+     description: "OpenAI API key (prefer using secrets)"
38
+     required: false
39
+
40
+ outputs:
41
+   results-path:
42
+     description: "Path to the results directory"
43
+     value: ${{ steps.run-benchmark.outputs.results-path }}
44
+   exit-code:
45
+     description: "Exit code from mcpbr run"
46
+     value: ${{ steps.run-benchmark.outputs.exit-code }}
47
+
48
+ runs:
49
+   using: "composite"
50
+   steps:
51
+     - name: Set up Python
52
+       uses: actions/setup-python@v5
53
+       with:
54
+         python-version: ${{ inputs.python-version }}
55
+
56
+     # Inputs reach the shell through env vars instead of being interpolated
57
+     # into the script text, so their contents are never parsed as shell code
58
+     # (this prevents script injection through action inputs).
59
+     - name: Install mcpbr
60
+       shell: bash
61
+       env:
62
+         INPUT_VERSION: ${{ inputs.version }}
63
+       run: |
64
+         python -m pip install --upgrade pip
65
+         if [ "$INPUT_VERSION" = "latest" ]; then
66
+           pip install mcpbr
67
+         else
68
+           pip install "mcpbr==$INPUT_VERSION"
69
+         fi
70
+
71
+     - name: Create output directory
72
+       shell: bash
73
+       env:
74
+         INPUT_OUTPUT_DIR: ${{ inputs.output-dir }}
75
+       run: mkdir -p "$INPUT_OUTPUT_DIR"
76
+
77
+     - name: Run benchmarks
78
+       id: run-benchmark
79
+       shell: bash
80
+       env:
81
+         ANTHROPIC_API_KEY: ${{ inputs.anthropic-api-key }}
82
+         OPENAI_API_KEY: ${{ inputs.openai-api-key }}
83
+         INPUT_CONFIG: ${{ inputs.config }}
84
+         INPUT_OUTPUT_DIR: ${{ inputs.output-dir }}
85
+         INPUT_EXTRA_ARGS: ${{ inputs.extra-args }}
86
+       run: |
87
+         # Capture the exit code instead of aborting immediately, so both
88
+         # outputs are written before the step reports the failure.
89
+         set +e
90
+         # INPUT_EXTRA_ARGS is intentionally unquoted: it is split on
91
+         # whitespace into separate arguments (embedded quotes are literal).
92
+         # shellcheck disable=SC2086
93
+         mcpbr run \
94
+           --config "$INPUT_CONFIG" \
95
+           --output-dir "$INPUT_OUTPUT_DIR" \
96
+           $INPUT_EXTRA_ARGS
97
+         EXIT_CODE=$?
98
+         set -e
99
+         echo "exit-code=${EXIT_CODE}" >> "$GITHUB_OUTPUT"
100
+         echo "results-path=${INPUT_OUTPUT_DIR}" >> "$GITHUB_OUTPUT"
101
+         exit "${EXIT_CODE}"
102
+
103
+     - name: Upload results artifact
104
+       if: always()
105
+       uses: actions/upload-artifact@v4
106
+       with:
107
+         name: ${{ inputs.artifact-name }}
108
+         path: ${{ inputs.output-dir }}
109
+         retention-days: 30
@@ -0,0 +1,25 @@
1
+ # Basic example: Run mcpbr benchmarks on push to main
2
+ name: MCP Benchmark
3
+
4
+ on:
5
+   push:
6
+     branches: [main]
7
+   workflow_dispatch:
8
+
9
+ jobs:
10
+   benchmark:
11
+     runs-on: ubuntu-latest
12
+     steps:
13
+       - uses: actions/checkout@v4
14
+
15
+       - name: Run MCP benchmarks
16
+         uses: greynewell/mcpbr@main
17
+         with:
18
+           config: benchmarks/config.yaml
19
+           anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }}
20
+           output-dir: results
21
+
22
+       - name: Check results
23
+         run: |
24
+           echo "Benchmark results saved to results/"
25
+           ls -la results/
@@ -0,0 +1,61 @@
1
+ # Matrix example: Run benchmarks across multiple configurations
2
+ name: MCP Benchmark Matrix
3
+
4
+ on:
5
+   pull_request:
6
+     branches: [main]
7
+   schedule:
8
+     - cron: "0 6 * * 1"  # Weekly on Monday at 6am UTC
9
+
10
+ jobs:
11
+   benchmark:
12
+     runs-on: ubuntu-latest
13
+     strategy:
14
+       fail-fast: false
15
+       matrix:
16
+         benchmark:
17
+           - swebench
18
+           - humaneval
19
+           - mbpp
20
+         python-version:
21
+           - "3.11"
22
+           - "3.12"
23
+
24
+     steps:
25
+       - uses: actions/checkout@v4
26
+
27
+       - name: Run ${{ matrix.benchmark }} benchmark
28
+         uses: greynewell/mcpbr@main
29
+         with:
30
+           config: benchmarks/${{ matrix.benchmark }}.yaml
31
+           python-version: ${{ matrix.python-version }}
32
+           anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }}
33
+           output-dir: results-${{ matrix.benchmark }}-py${{ matrix.python-version }}
34
+           # The action uploads the results itself. Artifact names must be
35
+           # unique within a workflow run, so each matrix job gets its own.
36
+           artifact-name: results-${{ matrix.benchmark }}-py${{ matrix.python-version }}
37
+
38
+   aggregate:
39
+     needs: benchmark
40
+     runs-on: ubuntu-latest
41
+     if: always()
42
+     steps:
43
+       - name: Download all results
44
+         uses: actions/download-artifact@v4
45
+         with:
46
+           path: all-results
47
+
48
+       - name: Summary
49
+         run: |
50
+           echo "## Benchmark Results" >> "$GITHUB_STEP_SUMMARY"
51
+           for dir in all-results/results-*; do
52
+             # Skip the literal pattern when no artifact directory matched.
53
+             [ -d "$dir" ] || continue
54
+             benchmark=$(basename "$dir")
55
+             echo "### ${benchmark}" >> "$GITHUB_STEP_SUMMARY"
56
+             for json_file in "${dir}"/*.json; do
57
+               if [ -f "$json_file" ]; then
58
+                 cat "$json_file" >> "$GITHUB_STEP_SUMMARY"
59
+               fi
60
+             done
61
+           done