mcpbr 0.4.13__tar.gz → 0.4.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/marketplace.json +2 -2
  2. {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/package.json +1 -1
  3. {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/plugin.json +1 -1
  4. {mcpbr-0.4.13 → mcpbr-0.4.14}/CHANGELOG.md +2 -0
  5. {mcpbr-0.4.13 → mcpbr-0.4.14}/PKG-INFO +1 -1
  6. {mcpbr-0.4.13 → mcpbr-0.4.14}/package.json +1 -1
  7. {mcpbr-0.4.13 → mcpbr-0.4.14}/pyproject.toml +1 -1
  8. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/config.py +22 -0
  9. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/docker_env.py +15 -4
  10. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/evaluation.py +19 -2
  11. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/harness.py +4 -1
  12. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/harnesses.py +29 -0
  13. mcpbr-0.4.14/src/mcpbr/swebench_test_specs.py +33 -0
  14. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/test_azure.py +82 -36
  15. mcpbr-0.4.13/tests/test_django_runner.py → mcpbr-0.4.14/tests/test_build_test_command.py +73 -1
  16. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_trial_mode.py +6 -0
  17. {mcpbr-0.4.13 → mcpbr-0.4.14}/uv.lock +1 -1
  18. {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude/settings.json +0 -0
  19. {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/README.md +0 -0
  20. {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/skills/README.md +0 -0
  21. {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/skills/benchmark-swe-lite/SKILL.md +0 -0
  22. {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/skills/mcpbr-config/SKILL.md +0 -0
  23. {mcpbr-0.4.13 → mcpbr-0.4.14}/.claude-plugin/skills/mcpbr-eval/SKILL.md +0 -0
  24. {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  25. {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  26. {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  27. {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  28. {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/dependabot.yml +0 -0
  29. {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/release-drafter.yml +0 -0
  30. {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/workflows/ci.yml +0 -0
  31. {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/workflows/post-release-bump.yml +0 -0
  32. {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/workflows/publish-npm.yml +0 -0
  33. {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/workflows/publish.yml +0 -0
  34. {mcpbr-0.4.13 → mcpbr-0.4.14}/.github/workflows/release-drafter.yml +0 -0
  35. {mcpbr-0.4.13 → mcpbr-0.4.14}/.gitignore +0 -0
  36. {mcpbr-0.4.13 → mcpbr-0.4.14}/.pre-commit-config.yaml +0 -0
  37. {mcpbr-0.4.13 → mcpbr-0.4.14}/AGENTS.md +0 -0
  38. {mcpbr-0.4.13 → mcpbr-0.4.14}/CLAUDE.md +0 -0
  39. {mcpbr-0.4.13 → mcpbr-0.4.14}/CODE_OF_CONDUCT.md +0 -0
  40. {mcpbr-0.4.13 → mcpbr-0.4.14}/CONTRIBUTING.md +0 -0
  41. {mcpbr-0.4.13 → mcpbr-0.4.14}/Dockerfile +0 -0
  42. {mcpbr-0.4.13 → mcpbr-0.4.14}/HUMANEVAL_FIX_SUMMARY.md +0 -0
  43. {mcpbr-0.4.13 → mcpbr-0.4.14}/LICENSE +0 -0
  44. {mcpbr-0.4.13 → mcpbr-0.4.14}/Makefile +0 -0
  45. {mcpbr-0.4.13 → mcpbr-0.4.14}/PR_SUMMARY.md +0 -0
  46. {mcpbr-0.4.13 → mcpbr-0.4.14}/README.md +0 -0
  47. {mcpbr-0.4.13 → mcpbr-0.4.14}/SECURITY.md +0 -0
  48. {mcpbr-0.4.13 → mcpbr-0.4.14}/assets/mcpbr-demo.gif +0 -0
  49. {mcpbr-0.4.13 → mcpbr-0.4.14}/assets/mcpbr-eval-results.png +0 -0
  50. {mcpbr-0.4.13 → mcpbr-0.4.14}/assets/mcpbr-logo.jpg +0 -0
  51. {mcpbr-0.4.13 → mcpbr-0.4.14}/bin/mcpbr.js +0 -0
  52. {mcpbr-0.4.13 → mcpbr-0.4.14}/config/example.yaml +0 -0
  53. {mcpbr-0.4.13 → mcpbr-0.4.14}/config/humaneval.yaml +0 -0
  54. {mcpbr-0.4.13 → mcpbr-0.4.14}/config/supermodel.yaml +0 -0
  55. {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/azure-config-example.yaml +0 -0
  56. {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/env-vars-example.yaml +0 -0
  57. {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/inheritance/README.md +0 -0
  58. {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/inheritance/base-config.yaml +0 -0
  59. {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/inheritance/dev-config.yaml +0 -0
  60. {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/inheritance/multi-extend-config.yaml +0 -0
  61. {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/inheritance/production-config.yaml +0 -0
  62. {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/inheritance/shared-mcp-settings.yaml +0 -0
  63. {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/local-config-example.yaml +0 -0
  64. {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/quick-start/gsm8k-math-reasoning.yaml +0 -0
  65. {mcpbr-0.4.13 → mcpbr-0.4.14}/examples/quick-start/test-your-mcp-server.yaml +0 -0
  66. {mcpbr-0.4.13 → mcpbr-0.4.14}/install.sh +0 -0
  67. {mcpbr-0.4.13 → mcpbr-0.4.14}/requirements.txt +0 -0
  68. {mcpbr-0.4.13 → mcpbr-0.4.14}/scripts/sync_version.py +0 -0
  69. {mcpbr-0.4.13 → mcpbr-0.4.14}/scripts/validate_plugin_manifests.py +0 -0
  70. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/__init__.py +0 -0
  71. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/__main__.py +0 -0
  72. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/agent.py +0 -0
  73. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/__init__.py +0 -0
  74. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/agentbench.py +0 -0
  75. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/aider_polyglot.py +0 -0
  76. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/apps.py +0 -0
  77. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/arc.py +0 -0
  78. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/base.py +0 -0
  79. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/bigbench_hard.py +0 -0
  80. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/bigcodebench.py +0 -0
  81. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/codecontests.py +0 -0
  82. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/codereval.py +0 -0
  83. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/cybergym.py +0 -0
  84. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/gaia.py +0 -0
  85. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/gsm8k.py +0 -0
  86. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/hellaswag.py +0 -0
  87. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/humaneval.py +0 -0
  88. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/intercode.py +0 -0
  89. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/leetcode.py +0 -0
  90. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/math_benchmark.py +0 -0
  91. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/mbpp.py +0 -0
  92. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/mcptoolbench.py +0 -0
  93. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/mlagentbench.py +0 -0
  94. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/repoqa.py +0 -0
  95. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/swebench.py +0 -0
  96. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/terminalbench.py +0 -0
  97. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/toolbench.py +0 -0
  98. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/truthfulqa.py +0 -0
  99. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/benchmarks/webarena.py +0 -0
  100. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/cache.py +0 -0
  101. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/cli.py +0 -0
  102. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/config_inheritance.py +0 -0
  103. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/config_validator.py +0 -0
  104. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/data/templates/brave-search.yaml +0 -0
  105. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/data/templates/filesystem.yaml +0 -0
  106. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/data/templates/github.yaml +0 -0
  107. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/data/templates/google-maps.yaml +0 -0
  108. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/data/templates/postgres.yaml +0 -0
  109. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/data/templates/slack.yaml +0 -0
  110. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/data/templates/sqlite.yaml +0 -0
  111. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/env_expansion.py +0 -0
  112. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/incremental_save.py +0 -0
  113. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/infrastructure/__init__.py +0 -0
  114. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/infrastructure/azure.py +0 -0
  115. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/infrastructure/azure_health.py +0 -0
  116. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/infrastructure/base.py +0 -0
  117. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/infrastructure/local.py +0 -0
  118. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/infrastructure/manager.py +0 -0
  119. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/junit_reporter.py +0 -0
  120. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/log_formatter.py +0 -0
  121. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/models.py +0 -0
  122. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/output_validator.py +0 -0
  123. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/preflight.py +0 -0
  124. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/pricing.py +0 -0
  125. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/profiler.py +0 -0
  126. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/providers.py +0 -0
  127. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/regression.py +0 -0
  128. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/reporting.py +0 -0
  129. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/schema.py +0 -0
  130. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/smoke_test.py +0 -0
  131. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/state_tracker.py +0 -0
  132. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/statistics.py +0 -0
  133. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/streaming.py +0 -0
  134. {mcpbr-0.4.13 → mcpbr-0.4.14}/src/mcpbr/templates.py +0 -0
  135. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/__init__.py +0 -0
  136. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/__init__.py +0 -0
  137. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/test_azure_health.py +0 -0
  138. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/test_base.py +0 -0
  139. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/test_cli_infrastructure.py +0 -0
  140. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/test_config.py +0 -0
  141. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/test_local.py +0 -0
  142. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/infrastructure/test_manager.py +0 -0
  143. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_agent.py +0 -0
  144. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_benchmark_filtering.py +0 -0
  145. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_benchmark_integration.py +0 -0
  146. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_benchmarks.py +0 -0
  147. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_cache.py +0 -0
  148. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_claude_plugin.py +0 -0
  149. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_cli_templates.py +0 -0
  150. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_comparison_aggregation.py +0 -0
  151. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_comparison_config.py +0 -0
  152. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_comparison_integration.py +0 -0
  153. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_comparison_reporting.py +0 -0
  154. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_config.py +0 -0
  155. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_config_env_vars.py +0 -0
  156. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_config_inheritance.py +0 -0
  157. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_config_validator.py +0 -0
  158. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_config_validator_inheritance.py +0 -0
  159. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_cost_calculation.py +0 -0
  160. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_default_logging.py +0 -0
  161. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_docker_cleanup.py +0 -0
  162. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_docker_label_fix.py +0 -0
  163. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_docker_retry.py +0 -0
  164. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_env_expansion.py +0 -0
  165. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_error_messages.py +0 -0
  166. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_evaluation.py +0 -0
  167. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_exit_codes.py +0 -0
  168. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_export.py +0 -0
  169. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_git_diff_new_files.py +0 -0
  170. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_incremental_save.py +0 -0
  171. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_integration.py +0 -0
  172. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_junit_reporter.py +0 -0
  173. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_log_formatter_read_tool.py +0 -0
  174. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_mcp_health_check.py +0 -0
  175. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_mcp_logging.py +0 -0
  176. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_models.py +0 -0
  177. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_output_validator.py +0 -0
  178. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_parse_errors.py +0 -0
  179. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_preflight.py +0 -0
  180. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_pricing.py +0 -0
  181. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_profiler.py +0 -0
  182. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_regression.py +0 -0
  183. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_reporting.py +0 -0
  184. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_runtime_tracking.py +0 -0
  185. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_schema.py +0 -0
  186. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_smoke_test.py +0 -0
  187. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_state_tracker.py +0 -0
  188. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_statistics.py +0 -0
  189. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_statistics_integration.py +0 -0
  190. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_streaming.py +0 -0
  191. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_string_concat_bug.py +0 -0
  192. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_templates.py +0 -0
  193. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_thinking_budget.py +0 -0
  194. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_timeout_tracking.py +0 -0
  195. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_tool_failure_tracking.py +0 -0
  196. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_type_safety.py +0 -0
  197. {mcpbr-0.4.13 → mcpbr-0.4.14}/tests/test_xml_export.py +0 -0
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "$schema": "https://anthropic.com/claude-code/marketplace.schema.json",
3
3
  "name": "mcpbr",
4
- "version": "0.4.13",
4
+ "version": "0.4.14",
5
5
  "description": "mcpbr - MCP Benchmark Runner plugin marketplace",
6
6
  "owner": {
7
7
  "name": "mcpbr Contributors",
@@ -11,7 +11,7 @@
11
11
  {
12
12
  "name": "mcpbr",
13
13
  "description": "Expert benchmark runner for MCP servers using mcpbr. Handles Docker checks, config generation, and result parsing.",
14
- "version": "0.4.13",
14
+ "version": "0.4.14",
15
15
  "author": {
16
16
  "name": "mcpbr Contributors"
17
17
  },
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@greynewell/mcpbr-claude-plugin",
3
- "version": "0.4.13",
3
+ "version": "0.4.14",
4
4
  "description": "Claude Code plugin for mcpbr - Expert benchmark runner for MCP servers with specialized skills",
5
5
  "keywords": [
6
6
  "claude-code",
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mcpbr",
3
- "version": "0.4.13",
3
+ "version": "0.4.14",
4
4
  "description": "Expert benchmark runner for MCP servers using mcpbr. Handles Docker checks, config generation, and result parsing.",
5
5
  "schema_version": "1.0"
6
6
  }
@@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
13
13
 
14
14
  ### Fixed
15
15
 
16
+ - **Repository-aware test commands for non-pytest projects** (#365): Use upstream SWE-bench test command specs for sympy (`bin/test`), sphinx (`tox`), and other non-pytest repos instead of defaulting to `python -m pytest`
17
+ - **Flaky Azure and trial mode tests**: Fixed tests that depended on local `~/.ssh/mcpbr_azure` state and updated assertions for multi-step dependency installation
16
18
  - **SEO improvements** for documentation site
17
19
  - Added robots.txt with sitemap reference
18
20
  - Added Open Graph and Twitter Card meta tags on all pages
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mcpbr
3
- Version: 0.4.13
3
+ Version: 0.4.14
4
4
  Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
5
5
  Project-URL: Homepage, https://github.com/greynewell/mcpbr
6
6
  Project-URL: Repository, https://github.com/greynewell/mcpbr
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@greynewell/mcpbr",
3
- "version": "0.4.13",
3
+ "version": "0.4.14",
4
4
  "description": "Model Context Protocol Benchmark Runner - CLI tool for evaluating MCP servers",
5
5
  "keywords": [
6
6
  "mcpbr",
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "mcpbr"
7
- version = "0.4.13"
7
+ version = "0.4.14"
8
8
  description = "Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -109,6 +109,16 @@ class MCPServerConfig(BaseModel):
109
109
  default=900000,
110
110
  description="Timeout in milliseconds for MCP tool execution (default: 15 min for long-running tools)",
111
111
  )
112
+ setup_command: str | None = Field(
113
+ default=None,
114
+ description="Shell command to run inside the container BEFORE the agent starts. "
115
+ "Runs outside the task timer (does not count against timeout_seconds). "
116
+ "Use {workdir} as placeholder. Useful for pre-computing caches.",
117
+ )
118
+ setup_timeout_ms: int = Field(
119
+ default=900000,
120
+ description="Timeout in milliseconds for the setup_command (default: 15 min)",
121
+ )
112
122
 
113
123
  def get_args_for_workdir(self, workdir: str) -> list[str]:
114
124
  """Replace {workdir} placeholder in args with actual path."""
@@ -117,6 +127,12 @@ class MCPServerConfig(BaseModel):
117
127
  result.append(arg.replace("{workdir}", workdir))
118
128
  return result
119
129
 
130
+ def get_setup_command_for_workdir(self, workdir: str) -> str | None:
131
+ """Replace {workdir} placeholder in setup_command with actual path."""
132
+ if self.setup_command is None:
133
+ return None
134
+ return self.setup_command.replace("{workdir}", workdir)
135
+
120
136
  def get_expanded_env(self) -> dict[str, str]:
121
137
  """Expand ${VAR} references in env values using os.environ.
122
138
 
@@ -400,6 +416,12 @@ class HarnessConfig(BaseModel):
400
416
  description="Enable comprehensive performance profiling (tool latency, memory, overhead)",
401
417
  )
402
418
 
419
+ volumes: dict[str, str] = Field(
420
+ default_factory=dict,
421
+ description="Additional volume mounts (read-write) for Docker containers (host_path: container_path). "
422
+ "Mounted into every container, persists across tasks. Useful for pre-computed caches.",
423
+ )
424
+
403
425
  infrastructure: InfrastructureConfig = Field(
404
426
  default_factory=InfrastructureConfig,
405
427
  description="Infrastructure configuration (local or azure)",
@@ -314,14 +314,18 @@ class DockerEnvironmentManager:
314
314
  FALLBACK_IMAGE = "mcpbr-env"
315
315
  DOCKERFILE_PATH = Path(__file__).parent.parent.parent / "Dockerfile"
316
316
 
317
- def __init__(self, use_prebuilt: bool = True) -> None:
317
+ def __init__(
318
+ self, use_prebuilt: bool = True, extra_volumes: dict[str, str] | None = None
319
+ ) -> None:
318
320
  """Initialize the Docker environment manager.
319
321
 
320
322
  Args:
321
323
  use_prebuilt: If True, try to use pre-built SWE-bench images first.
324
+ extra_volumes: Additional volume mounts (read-write) (host_path -> container_path).
322
325
  """
323
326
  self.client = docker.from_env()
324
327
  self.use_prebuilt = use_prebuilt
328
+ self._extra_volumes = extra_volumes or {}
325
329
  self._fallback_image_built = False
326
330
  self._temp_dirs: list[tempfile.TemporaryDirectory[str]] = []
327
331
  self._containers: list[Container] = []
@@ -488,6 +492,15 @@ CMD ["/bin/bash"]
488
492
 
489
493
  for attempt in range(max_retries + 1):
490
494
  try:
495
+ volumes_dict: dict[str, dict[str, str]] = {
496
+ host_workdir: {"bind": "/workspace", "mode": "rw"},
497
+ }
498
+ for host_path, container_path in self._extra_volumes.items():
499
+ volumes_dict[os.path.abspath(host_path)] = {
500
+ "bind": container_path,
501
+ "mode": "rw",
502
+ }
503
+
491
504
  container = self.client.containers.run(
492
505
  image_name,
493
506
  command="tail -f /dev/null",
@@ -495,9 +508,7 @@ CMD ["/bin/bash"]
495
508
  detach=True,
496
509
  platform="linux/amd64" if uses_prebuilt else None,
497
510
  network_mode="bridge", # Enable network for API calls
498
- volumes={
499
- host_workdir: {"bind": "/workspace", "mode": "rw"},
500
- },
511
+ volumes=volumes_dict,
501
512
  working_dir=container_workdir,
502
513
  remove=False,
503
514
  labels={
@@ -137,6 +137,7 @@ async def run_tests(
137
137
  timeout: int = 120,
138
138
  uses_prebuilt: bool = False,
139
139
  workdir: str | None = None,
140
+ repo: str | None = None,
140
141
  ) -> TestResults:
141
142
  """Run a list of tests and return results.
142
143
 
@@ -146,6 +147,7 @@ async def run_tests(
146
147
  timeout: Timeout per test in seconds.
147
148
  uses_prebuilt: Whether a pre-built SWE-bench image is being used.
148
149
  workdir: Working directory to run tests from. Defaults to env.workdir.
150
+ repo: Repository identifier for looking up the correct test runner.
149
151
 
150
152
  Returns:
151
153
  TestResults with pass/fail counts.
@@ -157,7 +159,7 @@ async def run_tests(
157
159
  passed = 0
158
160
 
159
161
  for test in tests:
160
- test_cmd = _build_test_command(test, uses_prebuilt)
162
+ test_cmd = _build_test_command(test, uses_prebuilt, repo=repo)
161
163
 
162
164
  try:
163
165
  exit_code, stdout, stderr = await env.exec_command(
@@ -198,7 +200,7 @@ async def run_tests(
198
200
  )
199
201
 
200
202
 
201
- def _build_test_command(test: str, uses_prebuilt: bool = False) -> str:
203
+ def _build_test_command(test: str, uses_prebuilt: bool = False, repo: str | None = None) -> str:
202
204
  """Build a test command for the given test identifier.
203
205
 
204
206
  Args:
@@ -206,18 +208,29 @@ def _build_test_command(test: str, uses_prebuilt: bool = False) -> str:
206
208
  - pytest: "tests/test_file.py::test_func" or "tests/test_file.py"
207
209
  - Django: "test_method (module.TestClass)" or "module.tests.TestClass.test_method"
208
210
  uses_prebuilt: If True, activate the testbed conda environment first.
211
+ repo: Repository identifier (e.g., "sympy/sympy") for looking up
212
+ the correct test runner from upstream SWE-bench specs.
209
213
 
210
214
  Returns:
211
215
  Shell command string to run the test.
212
216
  """
213
217
  import re
214
218
 
219
+ from .swebench_test_specs import get_repo_test_command
220
+
215
221
  # Pre-built SWE-bench images use a conda environment called 'testbed'
216
222
  if uses_prebuilt:
217
223
  activate = "source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed && "
218
224
  else:
219
225
  activate = ""
220
226
 
227
+ # Check upstream SWE-bench test command mapping for non-pytest runners
228
+ if repo:
229
+ upstream_cmd = get_repo_test_command(repo)
230
+ if upstream_cmd and "runtests.py" not in upstream_cmd and "pytest" not in upstream_cmd:
231
+ # Non-pytest, non-Django project (e.g., sympy uses bin/test)
232
+ return f"{activate}{upstream_cmd} {test}"
233
+
221
234
  # Detect Django test format: "test_method (module.TestClass)"
222
235
  if "(" in test and ")" in test and "." in test:
223
236
  # Extract module path from parentheses
@@ -344,12 +357,15 @@ async def evaluate_patch(
344
357
  if not env.uses_prebuilt:
345
358
  await _install_dependencies(env)
346
359
 
360
+ repo = task.get("repo")
361
+
347
362
  fail_to_pass_results = await run_tests(
348
363
  env,
349
364
  fail_to_pass_tests,
350
365
  timeout=test_timeout,
351
366
  uses_prebuilt=env.uses_prebuilt,
352
367
  workdir=eval_workdir,
368
+ repo=repo,
353
369
  )
354
370
 
355
371
  pass_to_pass_results = await run_tests(
@@ -358,6 +374,7 @@ async def evaluate_patch(
358
374
  timeout=test_timeout,
359
375
  uses_prebuilt=env.uses_prebuilt,
360
376
  workdir=eval_workdir,
377
+ repo=repo,
361
378
  )
362
379
 
363
380
  resolved = (
@@ -962,7 +962,10 @@ async def run_evaluation(
962
962
  "args": config.mcp_server.args if config.mcp_server else [],
963
963
  }
964
964
 
965
- docker_manager = DockerEnvironmentManager(use_prebuilt=config.use_prebuilt_images)
965
+ docker_manager = DockerEnvironmentManager(
966
+ use_prebuilt=config.use_prebuilt_images,
967
+ extra_volumes=config.volumes,
968
+ )
966
969
 
967
970
  results: list[TaskResult] = []
968
971
  # Add cached results if using state tracker
@@ -895,6 +895,35 @@ class ClaudeCodeHarness:
895
895
  cost_usd=None,
896
896
  )
897
897
 
898
+ # Run setup_command if configured (BEFORE agent, OUTSIDE task timer).
899
+ # This is the right place for expensive one-time operations like
900
+ # pre-computing caches that should not count against timeout_seconds.
901
+ if self.mcp_server and self.mcp_server.setup_command:
902
+ setup_cmd = self.mcp_server.get_setup_command_for_workdir(env.workdir)
903
+ setup_timeout = int(self.mcp_server.setup_timeout_ms / 1000)
904
+
905
+ if verbose:
906
+ self._console.print(
907
+ f"[cyan]Running setup command (timeout: {setup_timeout:.0f}s)...[/cyan]"
908
+ )
909
+
910
+ setup_full_cmd = f"source {shlex.quote(env_file)} && {setup_cmd}"
911
+ setup_exit, _setup_stdout, setup_stderr = await env.exec_command(
912
+ ["/bin/bash", "-c", setup_full_cmd],
913
+ timeout=setup_timeout,
914
+ )
915
+
916
+ if setup_exit != 0:
917
+ if verbose:
918
+ self._console.print(
919
+ f"[yellow]⚠ Setup command exited with code {setup_exit}[/yellow]"
920
+ )
921
+ if setup_stderr:
922
+ self._console.print(f"[dim]{setup_stderr[:500]}[/dim]")
923
+ # Non-fatal: continue with agent even if setup fails
924
+ elif verbose:
925
+ self._console.print("[green]✓ Setup command completed[/green]")
926
+
898
927
  try:
899
928
  claude_args = [
900
929
  "--print",
@@ -0,0 +1,33 @@
1
+ """Test command specs from upstream SWE-bench harness.
2
+
3
+ Maps repositories to their correct test commands. mcpbr defaults to pytest
4
+ for all non-Django projects, but some projects (e.g., sympy) use custom test
5
+ runners that aren't pytest-compatible.
6
+
7
+ Source: https://github.com/SWE-bench/SWE-bench/blob/main/swebench/harness/constants/python.py
8
+ """
9
+
10
+ # Base test commands per framework (from upstream constants/python.py)
11
+ TEST_PYTEST = "pytest --no-header -rA --tb=no -p no:cacheprovider"
12
+ TEST_DJANGO = "./tests/runtests.py --verbosity 2 --settings=test_sqlite --parallel 1"
13
+ TEST_SYMPY = "PYTHONWARNINGS='ignore::UserWarning,ignore::SyntaxWarning' bin/test -C --verbose"
14
+ TEST_SPHINX = "tox --current-env -epy39 -v --"
15
+ TEST_ASTROPY = "pytest -rA -vv -o console_output_style=classic --tb=no"
16
+ TEST_SEABORN = "pytest --no-header -rA"
17
+
18
+ # Repo → test command mapping
19
+ # Only non-pytest entries need to be here — pytest is the default fallback.
20
+ # Django is included for documentation but its existing handler takes precedence.
21
+ REPO_TO_TEST_CMD: dict[str, str] = {
22
+ "sympy/sympy": TEST_SYMPY,
23
+ "django/django": TEST_DJANGO,
24
+ "sphinx-doc/sphinx": TEST_SPHINX,
25
+ }
26
+
27
+
28
+ def get_repo_test_command(repo: str) -> str | None:
29
+ """Look up the upstream test command for a repo.
30
+
31
+ Returns None if repo uses standard pytest (handled by existing logic).
32
+ """
33
+ return REPO_TO_TEST_CMD.get(repo)
@@ -155,11 +155,15 @@ class TestVMProvisioning:
155
155
  mock_time: MagicMock,
156
156
  mock_run: MagicMock,
157
157
  azure_provider: AzureProvider,
158
+ tmp_path: Path,
158
159
  ) -> None:
159
160
  """Test successful VM creation."""
160
- # Mock ssh-keygen, resource group show (exists), vm create
161
+ # Use existing SSH key to avoid depending on ~/.ssh/mcpbr_azure state
162
+ ssh_key = tmp_path / "test_key"
163
+ ssh_key.touch()
164
+ azure_provider.azure_config.ssh_key_path = ssh_key
165
+
161
166
  mock_run.side_effect = [
162
- Mock(returncode=0), # ssh-keygen
163
167
  Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show (exists)
164
168
  Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
165
169
  ]
@@ -179,11 +183,15 @@ class TestVMProvisioning:
179
183
  mock_time: MagicMock,
180
184
  mock_run: MagicMock,
181
185
  azure_provider: AzureProvider,
186
+ tmp_path: Path,
182
187
  ) -> None:
183
188
  """Test VM creation with resource group creation."""
184
- # Mock resource group doesn't exist, then create it
189
+ # Use existing SSH key to avoid depending on ~/.ssh/mcpbr_azure state
190
+ ssh_key = tmp_path / "test_key"
191
+ ssh_key.touch()
192
+ azure_provider.azure_config.ssh_key_path = ssh_key
193
+
185
194
  mock_run.side_effect = [
186
- Mock(returncode=0), # ssh-keygen
187
195
  Mock(returncode=1, stderr="ResourceGroupNotFound"), # az group show (not found)
188
196
  Mock(returncode=0), # az group create
189
197
  Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
@@ -198,16 +206,19 @@ class TestVMProvisioning:
198
206
  self,
199
207
  mock_run: MagicMock,
200
208
  azure_provider: AzureProvider,
209
+ tmp_path: Path,
201
210
  ) -> None:
202
211
  """Test VM creation with SSH key generation."""
203
- # Mock ssh-keygen, resource group show, and vm creation
204
- mock_run.side_effect = [
205
- Mock(returncode=0), # ssh-keygen
206
- Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
207
- Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
208
- ]
212
+ # Redirect Path.home to tmp_path so ~/.ssh/mcpbr_azure doesn't exist
213
+ with patch("mcpbr.infrastructure.azure.Path.home", return_value=tmp_path):
214
+ # Mock ssh-keygen, resource group show, and vm creation
215
+ mock_run.side_effect = [
216
+ Mock(returncode=0), # ssh-keygen
217
+ Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
218
+ Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
219
+ ]
209
220
 
210
- await azure_provider._create_vm("Standard_D8s_v3")
221
+ await azure_provider._create_vm("Standard_D8s_v3")
211
222
 
212
223
  # Verify ssh-keygen was called
213
224
  ssh_keygen_call = mock_run.call_args_list[0]
@@ -218,11 +229,15 @@ class TestVMProvisioning:
218
229
  self,
219
230
  mock_run: MagicMock,
220
231
  azure_provider: AzureProvider,
232
+ tmp_path: Path,
221
233
  ) -> None:
222
234
  """Test VM creation failure (quota exceeded)."""
223
- # Mock ssh-keygen success, resource group show, VM creation failure
235
+ # Use existing SSH key to avoid depending on ~/.ssh/mcpbr_azure state
236
+ ssh_key = tmp_path / "test_key"
237
+ ssh_key.touch()
238
+ azure_provider.azure_config.ssh_key_path = ssh_key
239
+
224
240
  mock_run.side_effect = [
225
- Mock(returncode=0), # ssh-keygen
226
241
  Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
227
242
  Mock(returncode=1, stderr="QuotaExceeded: Core quota exceeded"), # az vm create
228
243
  ]
@@ -577,13 +592,18 @@ class TestSetup:
577
592
  mock_ssh_client: MagicMock,
578
593
  mock_run: MagicMock,
579
594
  azure_provider: AzureProvider,
595
+ tmp_path: Path,
580
596
  ) -> None:
581
597
  """Test full setup flow (create VM, wait SSH, get IP, install, config, test)."""
582
598
  mock_env_get.return_value = "test-api-key"
583
599
 
584
- # Mock ssh-keygen, resource group show, vm create, vm show
600
+ # Use existing SSH key to avoid depending on ~/.ssh/mcpbr_azure state
601
+ ssh_key = tmp_path / "test_key"
602
+ ssh_key.touch()
603
+ azure_provider.azure_config.ssh_key_path = ssh_key
604
+
605
+ # Mock resource group show, vm create, vm show (no ssh-keygen needed)
585
606
  mock_run.side_effect = [
586
- Mock(returncode=0), # ssh-keygen
587
607
  Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
588
608
  Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
589
609
  Mock(returncode=0, stdout='"1.2.3.4"'), # az vm show (note: quoted string in JSON)
@@ -618,11 +638,16 @@ class TestSetup:
618
638
  mock_time: MagicMock,
619
639
  mock_run: MagicMock,
620
640
  azure_provider: AzureProvider,
641
+ tmp_path: Path,
621
642
  ) -> None:
622
643
  """Test setup failure rolls back VM creation."""
623
- # Mock ssh-keygen success, resource group show, VM creation success, IP retrieval failure
644
+ # Use existing SSH key to avoid depending on ~/.ssh/mcpbr_azure state
645
+ ssh_key = tmp_path / "test_key"
646
+ ssh_key.touch()
647
+ azure_provider.azure_config.ssh_key_path = ssh_key
648
+
649
+ # Mock resource group show, VM creation success, IP retrieval failure
624
650
  mock_run.side_effect = [
625
- Mock(returncode=0), # ssh-keygen
626
651
  Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
627
652
  Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
628
653
  Mock(returncode=1, stderr="VM not found"), # az vm show (failure)
@@ -687,11 +712,12 @@ class TestSetup:
687
712
  mock_ssh_client: MagicMock,
688
713
  mock_run: MagicMock,
689
714
  azure_provider: AzureProvider,
715
+ tmp_path: Path,
690
716
  ) -> None:
691
717
  """Test setup with generated SSH key."""
692
718
  mock_env_get.return_value = "test-api-key"
693
719
 
694
- # No SSH key configured
720
+ # No SSH key configured - redirect home to tmp_path so key doesn't exist
695
721
  azure_provider.azure_config.ssh_key_path = None
696
722
 
697
723
  mock_run.side_effect = [
@@ -717,7 +743,8 @@ class TestSetup:
717
743
  mock_sftp = MagicMock()
718
744
  mock_client.open_sftp.return_value = mock_sftp
719
745
 
720
- await azure_provider.setup()
746
+ with patch("mcpbr.infrastructure.azure.Path.home", return_value=tmp_path):
747
+ await azure_provider.setup()
721
748
 
722
749
  # Verify ssh-keygen was called
723
750
  ssh_keygen_call = mock_run.call_args_list[0]
@@ -793,12 +820,13 @@ class TestEnvironmentSetup:
793
820
 
794
821
  await azure_provider._install_dependencies()
795
822
 
796
- # Verify command was executed
797
- mock_client.exec_command.assert_called_once()
798
- cmd = mock_client.exec_command.call_args[0][0]
799
- assert "apt-get update" in cmd
800
- assert "docker" in cmd.lower()
801
- assert "pip3 install mcpbr" in cmd
823
+ # Verify all 4 steps were executed (Docker, Python, Node.js, mcpbr)
824
+ assert mock_client.exec_command.call_count == 4
825
+ all_cmds = [call[0][0] for call in mock_client.exec_command.call_args_list]
826
+ all_cmds_str = " ".join(all_cmds)
827
+ assert "apt-get update" in all_cmds_str
828
+ assert "docker" in all_cmds_str.lower()
829
+ assert "pip install mcpbr" in all_cmds_str
802
830
 
803
831
  async def test_install_dependencies_handles_failures_gracefully(
804
832
  self,
@@ -819,7 +847,8 @@ class TestEnvironmentSetup:
819
847
  # Should not raise - just log warning
820
848
  await azure_provider._install_dependencies()
821
849
 
822
- mock_client.exec_command.assert_called_once()
850
+ # All 4 steps still execute even if individual steps fail
851
+ assert mock_client.exec_command.call_count == 4
823
852
 
824
853
  async def test_install_dependencies_installs_docker(
825
854
  self,
@@ -839,8 +868,9 @@ class TestEnvironmentSetup:
839
868
 
840
869
  await azure_provider._install_dependencies()
841
870
 
842
- cmd = mock_client.exec_command.call_args[0][0]
843
- assert "get.docker.com" in cmd
871
+ # Docker install is the first step
872
+ all_cmds = [call[0][0] for call in mock_client.exec_command.call_args_list]
873
+ assert any("get.docker.com" in cmd for cmd in all_cmds)
844
874
 
845
875
  async def test_install_dependencies_installs_python_version(
846
876
  self,
@@ -879,8 +909,9 @@ class TestEnvironmentSetup:
879
909
 
880
910
  await azure_provider._install_dependencies()
881
911
 
882
- cmd = mock_client.exec_command.call_args[0][0]
883
- assert "pip3 install mcpbr" in cmd
912
+ # mcpbr install is the last step
913
+ all_cmds = [call[0][0] for call in mock_client.exec_command.call_args_list]
914
+ assert any("pip install mcpbr" in cmd for cmd in all_cmds)
884
915
 
885
916
 
886
917
  # ============================================================================
@@ -1209,13 +1240,16 @@ class TestUpdatedSetup:
1209
1240
  mock_ssh_client: MagicMock,
1210
1241
  mock_run: MagicMock,
1211
1242
  azure_provider: AzureProvider,
1243
+ tmp_path: Path,
1212
1244
  ) -> None:
1213
1245
  """Test full setup flow includes dependency installation."""
1214
1246
  mock_env_get.return_value = "test-api-key"
1247
+ ssh_key = tmp_path / "test_key"
1248
+ ssh_key.touch()
1249
+ azure_provider.azure_config.ssh_key_path = ssh_key
1215
1250
 
1216
- # Mock subprocess calls
1251
+ # Mock subprocess calls (no ssh-keygen needed with existing key)
1217
1252
  mock_run.side_effect = [
1218
- Mock(returncode=0), # ssh-keygen
1219
1253
  Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
1220
1254
  Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
1221
1255
  Mock(returncode=0, stdout='"1.2.3.4"'), # az vm show
@@ -1259,12 +1293,15 @@ class TestUpdatedSetup:
1259
1293
  mock_ssh_client: MagicMock,
1260
1294
  mock_run: MagicMock,
1261
1295
  azure_provider: AzureProvider,
1296
+ tmp_path: Path,
1262
1297
  ) -> None:
1263
1298
  """Test full setup flow includes config transfer."""
1264
1299
  mock_env_get.return_value = "test-api-key"
1300
+ ssh_key = tmp_path / "test_key"
1301
+ ssh_key.touch()
1302
+ azure_provider.azure_config.ssh_key_path = ssh_key
1265
1303
 
1266
1304
  mock_run.side_effect = [
1267
- Mock(returncode=0), # ssh-keygen
1268
1305
  Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
1269
1306
  Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
1270
1307
  Mock(returncode=0, stdout='"1.2.3.4"'), # az vm show
@@ -1301,12 +1338,15 @@ class TestUpdatedSetup:
1301
1338
  mock_ssh_client: MagicMock,
1302
1339
  mock_run: MagicMock,
1303
1340
  azure_provider: AzureProvider,
1341
+ tmp_path: Path,
1304
1342
  ) -> None:
1305
1343
  """Test full setup flow includes env var export."""
1306
1344
  mock_env_get.return_value = "test-api-key"
1345
+ ssh_key = tmp_path / "test_key"
1346
+ ssh_key.touch()
1347
+ azure_provider.azure_config.ssh_key_path = ssh_key
1307
1348
 
1308
1349
  mock_run.side_effect = [
1309
- Mock(returncode=0), # ssh-keygen
1310
1350
  Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
1311
1351
  Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
1312
1352
  Mock(returncode=0, stdout='"1.2.3.4"'), # az vm show
@@ -1343,12 +1383,15 @@ class TestUpdatedSetup:
1343
1383
  mock_ssh_client: MagicMock,
1344
1384
  mock_run: MagicMock,
1345
1385
  azure_provider: AzureProvider,
1386
+ tmp_path: Path,
1346
1387
  ) -> None:
1347
1388
  """Test full setup flow includes test task."""
1348
1389
  mock_env_get.return_value = "test-api-key"
1390
+ ssh_key = tmp_path / "test_key"
1391
+ ssh_key.touch()
1392
+ azure_provider.azure_config.ssh_key_path = ssh_key
1349
1393
 
1350
1394
  mock_run.side_effect = [
1351
- Mock(returncode=0), # ssh-keygen
1352
1395
  Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
1353
1396
  Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
1354
1397
  Mock(returncode=0, stdout='"1.2.3.4"'), # az vm show
@@ -1385,12 +1428,15 @@ class TestUpdatedSetup:
1385
1428
  mock_ssh_client: MagicMock,
1386
1429
  mock_run: MagicMock,
1387
1430
  azure_provider: AzureProvider,
1431
+ tmp_path: Path,
1388
1432
  ) -> None:
1389
1433
  """Test setup fails if test task fails."""
1390
1434
  mock_env_get.return_value = "test-api-key"
1435
+ ssh_key = tmp_path / "test_key"
1436
+ ssh_key.touch()
1437
+ azure_provider.azure_config.ssh_key_path = ssh_key
1391
1438
 
1392
1439
  mock_run.side_effect = [
1393
- Mock(returncode=0), # ssh-keygen
1394
1440
  Mock(returncode=0, stdout='{"id": "rg-id"}'), # az group show
1395
1441
  Mock(returncode=0, stdout='{"id": "vm-id"}'), # az vm create
1396
1442
  Mock(returncode=0, stdout='"1.2.3.4"'), # az vm show