mcpbr 0.5.2__tar.gz → 0.5.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248)
  1. {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/marketplace.json +2 -2
  2. {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/package.json +1 -1
  3. {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/plugin.json +1 -1
  4. {mcpbr-0.5.2 → mcpbr-0.5.4}/PKG-INFO +1 -1
  5. {mcpbr-0.5.2 → mcpbr-0.5.4}/package.json +1 -1
  6. {mcpbr-0.5.2 → mcpbr-0.5.4}/pyproject.toml +1 -1
  7. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/__init__.py +1 -1
  8. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/evaluation.py +75 -54
  9. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/harness.py +145 -3
  10. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/infrastructure/azure.py +4 -3
  11. mcpbr-0.5.4/tests/test_cold_start.py +130 -0
  12. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_mcp_logging.py +11 -9
  13. {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude/settings.json +0 -0
  14. {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/README.md +0 -0
  15. {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/skills/README.md +0 -0
  16. {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/skills/benchmark-swe-lite/SKILL.md +0 -0
  17. {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/skills/mcpbr-config/SKILL.md +0 -0
  18. {mcpbr-0.5.2 → mcpbr-0.5.4}/.claude-plugin/skills/mcpbr-eval/SKILL.md +0 -0
  19. {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  20. {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  21. {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  22. {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  23. {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/dependabot.yml +0 -0
  24. {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/release-drafter.yml +0 -0
  25. {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/workflows/ci.yml +0 -0
  26. {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/workflows/post-release-bump.yml +0 -0
  27. {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/workflows/publish-npm.yml +0 -0
  28. {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/workflows/publish.yml +0 -0
  29. {mcpbr-0.5.2 → mcpbr-0.5.4}/.github/workflows/release-drafter.yml +0 -0
  30. {mcpbr-0.5.2 → mcpbr-0.5.4}/.gitignore +0 -0
  31. {mcpbr-0.5.2 → mcpbr-0.5.4}/.pre-commit-config.yaml +0 -0
  32. {mcpbr-0.5.2 → mcpbr-0.5.4}/AGENTS.md +0 -0
  33. {mcpbr-0.5.2 → mcpbr-0.5.4}/CHANGELOG.md +0 -0
  34. {mcpbr-0.5.2 → mcpbr-0.5.4}/CLAUDE.md +0 -0
  35. {mcpbr-0.5.2 → mcpbr-0.5.4}/CODE_OF_CONDUCT.md +0 -0
  36. {mcpbr-0.5.2 → mcpbr-0.5.4}/CONTRIBUTING.md +0 -0
  37. {mcpbr-0.5.2 → mcpbr-0.5.4}/Dockerfile +0 -0
  38. {mcpbr-0.5.2 → mcpbr-0.5.4}/HUMANEVAL_FIX_SUMMARY.md +0 -0
  39. {mcpbr-0.5.2 → mcpbr-0.5.4}/LICENSE +0 -0
  40. {mcpbr-0.5.2 → mcpbr-0.5.4}/Makefile +0 -0
  41. {mcpbr-0.5.2 → mcpbr-0.5.4}/PR_SUMMARY.md +0 -0
  42. {mcpbr-0.5.2 → mcpbr-0.5.4}/README.md +0 -0
  43. {mcpbr-0.5.2 → mcpbr-0.5.4}/SECURITY.md +0 -0
  44. {mcpbr-0.5.2 → mcpbr-0.5.4}/assets/mcpbr-demo.gif +0 -0
  45. {mcpbr-0.5.2 → mcpbr-0.5.4}/assets/mcpbr-eval-results.png +0 -0
  46. {mcpbr-0.5.2 → mcpbr-0.5.4}/assets/mcpbr-logo.jpg +0 -0
  47. {mcpbr-0.5.2 → mcpbr-0.5.4}/bin/mcpbr.js +0 -0
  48. {mcpbr-0.5.2 → mcpbr-0.5.4}/config/example.yaml +0 -0
  49. {mcpbr-0.5.2 → mcpbr-0.5.4}/config/humaneval.yaml +0 -0
  50. {mcpbr-0.5.2 → mcpbr-0.5.4}/config/supermodel.yaml +0 -0
  51. {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/azure-config-example.yaml +0 -0
  52. {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/custom-benchmark.yaml +0 -0
  53. {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/env-vars-example.yaml +0 -0
  54. {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/inheritance/README.md +0 -0
  55. {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/inheritance/base-config.yaml +0 -0
  56. {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/inheritance/dev-config.yaml +0 -0
  57. {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/inheritance/multi-extend-config.yaml +0 -0
  58. {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/inheritance/production-config.yaml +0 -0
  59. {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/inheritance/shared-mcp-settings.yaml +0 -0
  60. {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/local-config-example.yaml +0 -0
  61. {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/quick-start/gsm8k-math-reasoning.yaml +0 -0
  62. {mcpbr-0.5.2 → mcpbr-0.5.4}/examples/quick-start/test-your-mcp-server.yaml +0 -0
  63. {mcpbr-0.5.2 → mcpbr-0.5.4}/install.sh +0 -0
  64. {mcpbr-0.5.2 → mcpbr-0.5.4}/requirements.txt +0 -0
  65. {mcpbr-0.5.2 → mcpbr-0.5.4}/scripts/sync_version.py +0 -0
  66. {mcpbr-0.5.2 → mcpbr-0.5.4}/scripts/validate_plugin_manifests.py +0 -0
  67. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/__main__.py +0 -0
  68. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/agent.py +0 -0
  69. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/__init__.py +0 -0
  70. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/adversarial.py +0 -0
  71. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/agentbench.py +0 -0
  72. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/aider_polyglot.py +0 -0
  73. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/apps.py +0 -0
  74. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/arc.py +0 -0
  75. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/base.py +0 -0
  76. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/bigbench_hard.py +0 -0
  77. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/bigcodebench.py +0 -0
  78. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/codecontests.py +0 -0
  79. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/codereval.py +0 -0
  80. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/custom.py +0 -0
  81. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/cybergym.py +0 -0
  82. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/gaia.py +0 -0
  83. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/gsm8k.py +0 -0
  84. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/hellaswag.py +0 -0
  85. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/humaneval.py +0 -0
  86. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/intercode.py +0 -0
  87. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/leetcode.py +0 -0
  88. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/longbench.py +0 -0
  89. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/math_benchmark.py +0 -0
  90. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/mbpp.py +0 -0
  91. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/mcptoolbench.py +0 -0
  92. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/mlagentbench.py +0 -0
  93. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/mmmu.py +0 -0
  94. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/repoqa.py +0 -0
  95. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/swebench.py +0 -0
  96. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/terminalbench.py +0 -0
  97. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/toolbench.py +0 -0
  98. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/truthfulqa.py +0 -0
  99. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/benchmarks/webarena.py +0 -0
  100. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/cache.py +0 -0
  101. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/cli.py +0 -0
  102. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/config.py +0 -0
  103. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/config_inheritance.py +0 -0
  104. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/config_migration.py +0 -0
  105. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/config_validator.py +0 -0
  106. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/config_wizard.py +0 -0
  107. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/custom_metrics.py +0 -0
  108. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/dashboard.py +0 -0
  109. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/data/templates/brave-search.yaml +0 -0
  110. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/data/templates/filesystem.yaml +0 -0
  111. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/data/templates/github.yaml +0 -0
  112. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/data/templates/google-maps.yaml +0 -0
  113. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/data/templates/postgres.yaml +0 -0
  114. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/data/templates/slack.yaml +0 -0
  115. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/data/templates/sqlite.yaml +0 -0
  116. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/dataset_streaming.py +0 -0
  117. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/dataset_versioning.py +0 -0
  118. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/docker_cache.py +0 -0
  119. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/docker_env.py +0 -0
  120. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/docker_prewarm.py +0 -0
  121. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/dry_run.py +0 -0
  122. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/env_expansion.py +0 -0
  123. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/failure_analysis.py +0 -0
  124. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/few_shot.py +0 -0
  125. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/formatting.py +0 -0
  126. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/gpu_support.py +0 -0
  127. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/harnesses.py +0 -0
  128. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/incremental_save.py +0 -0
  129. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/infrastructure/__init__.py +0 -0
  130. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/infrastructure/azure_health.py +0 -0
  131. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/infrastructure/base.py +0 -0
  132. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/infrastructure/local.py +0 -0
  133. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/infrastructure/manager.py +0 -0
  134. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/junit_reporter.py +0 -0
  135. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/latency_metrics.py +0 -0
  136. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/log_formatter.py +0 -0
  137. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/models.py +0 -0
  138. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/output_validator.py +0 -0
  139. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/preflight.py +0 -0
  140. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/pricing.py +0 -0
  141. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/profiler.py +0 -0
  142. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/providers.py +0 -0
  143. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/regression.py +0 -0
  144. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/reporting.py +0 -0
  145. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/resource_limits.py +0 -0
  146. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/result_streaming.py +0 -0
  147. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/sampling.py +0 -0
  148. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/schema.py +0 -0
  149. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/smoke_test.py +0 -0
  150. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/state_tracker.py +0 -0
  151. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/statistics.py +0 -0
  152. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/streaming.py +0 -0
  153. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/swebench_test_specs.py +0 -0
  154. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/task_batching.py +0 -0
  155. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/task_scheduler.py +0 -0
  156. {mcpbr-0.5.2 → mcpbr-0.5.4}/src/mcpbr/templates.py +0 -0
  157. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/__init__.py +0 -0
  158. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/__init__.py +0 -0
  159. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/test_azure.py +0 -0
  160. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/test_azure_health.py +0 -0
  161. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/test_base.py +0 -0
  162. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/test_cli_infrastructure.py +0 -0
  163. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/test_config.py +0 -0
  164. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/test_local.py +0 -0
  165. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/infrastructure/test_manager.py +0 -0
  166. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_adversarial_benchmark.py +0 -0
  167. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_agent.py +0 -0
  168. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_benchmark_filtering.py +0 -0
  169. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_benchmark_integration.py +0 -0
  170. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_benchmarks.py +0 -0
  171. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_build_test_command.py +0 -0
  172. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_cache.py +0 -0
  173. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_claude_plugin.py +0 -0
  174. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_cli_templates.py +0 -0
  175. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_comparison_aggregation.py +0 -0
  176. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_comparison_config.py +0 -0
  177. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_comparison_integration.py +0 -0
  178. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_comparison_reporting.py +0 -0
  179. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_config.py +0 -0
  180. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_config_env_vars.py +0 -0
  181. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_config_inheritance.py +0 -0
  182. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_config_migration.py +0 -0
  183. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_config_validator.py +0 -0
  184. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_config_validator_inheritance.py +0 -0
  185. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_config_wizard.py +0 -0
  186. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_cost_calculation.py +0 -0
  187. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_custom_benchmark.py +0 -0
  188. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_custom_metrics.py +0 -0
  189. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_dashboard.py +0 -0
  190. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_dataset_streaming.py +0 -0
  191. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_dataset_versioning.py +0 -0
  192. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_default_logging.py +0 -0
  193. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_docker_cache.py +0 -0
  194. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_docker_cleanup.py +0 -0
  195. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_docker_label_fix.py +0 -0
  196. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_docker_prewarm.py +0 -0
  197. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_docker_retry.py +0 -0
  198. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_dry_run.py +0 -0
  199. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_env_expansion.py +0 -0
  200. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_error_messages.py +0 -0
  201. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_eval_reliability.py +0 -0
  202. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_evaluation.py +0 -0
  203. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_exit_codes.py +0 -0
  204. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_export.py +0 -0
  205. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_failure_analysis.py +0 -0
  206. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_few_shot.py +0 -0
  207. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_formatting.py +0 -0
  208. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_git_diff_new_files.py +0 -0
  209. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_gpu_support.py +0 -0
  210. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_incremental_save.py +0 -0
  211. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_integration.py +0 -0
  212. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_junit_reporter.py +0 -0
  213. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_latency_metrics.py +0 -0
  214. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_log_formatter_read_tool.py +0 -0
  215. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_longbench_benchmark.py +0 -0
  216. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_mcp_health_check.py +0 -0
  217. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_mcptoolbench_benchmark.py +0 -0
  218. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_mmmu_benchmark.py +0 -0
  219. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_models.py +0 -0
  220. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_output_validator.py +0 -0
  221. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_parse_errors.py +0 -0
  222. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_preflight.py +0 -0
  223. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_pricing.py +0 -0
  224. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_profiler.py +0 -0
  225. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_regression.py +0 -0
  226. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_reporting.py +0 -0
  227. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_resource_limits.py +0 -0
  228. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_result_streaming.py +0 -0
  229. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_runtime_tracking.py +0 -0
  230. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_sampling.py +0 -0
  231. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_schema.py +0 -0
  232. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_setup_command_fixes.py +0 -0
  233. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_smoke_test.py +0 -0
  234. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_state_tracker.py +0 -0
  235. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_statistics.py +0 -0
  236. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_statistics_integration.py +0 -0
  237. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_streaming.py +0 -0
  238. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_string_concat_bug.py +0 -0
  239. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_task_batching.py +0 -0
  240. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_task_scheduler.py +0 -0
  241. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_templates.py +0 -0
  242. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_thinking_budget.py +0 -0
  243. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_timeout_tracking.py +0 -0
  244. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_tool_failure_tracking.py +0 -0
  245. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_trial_mode.py +0 -0
  246. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_type_safety.py +0 -0
  247. {mcpbr-0.5.2 → mcpbr-0.5.4}/tests/test_xml_export.py +0 -0
  248. {mcpbr-0.5.2 → mcpbr-0.5.4}/uv.lock +0 -0
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "$schema": "https://anthropic.com/claude-code/marketplace.schema.json",
3
3
  "name": "mcpbr",
4
- "version": "0.5.2",
4
+ "version": "0.5.4",
5
5
  "description": "mcpbr - MCP Benchmark Runner plugin marketplace",
6
6
  "owner": {
7
7
  "name": "mcpbr Contributors",
@@ -11,7 +11,7 @@
11
11
  {
12
12
  "name": "mcpbr",
13
13
  "description": "Expert benchmark runner for MCP servers using mcpbr. Handles Docker checks, config generation, and result parsing.",
14
- "version": "0.5.2",
14
+ "version": "0.5.4",
15
15
  "author": {
16
16
  "name": "mcpbr Contributors"
17
17
  },
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@greynewell/mcpbr-claude-plugin",
3
- "version": "0.5.2",
3
+ "version": "0.5.4",
4
4
  "description": "Claude Code plugin for mcpbr - Expert benchmark runner for MCP servers with specialized skills",
5
5
  "keywords": [
6
6
  "claude-code",
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mcpbr",
3
- "version": "0.5.2",
3
+ "version": "0.5.4",
4
4
  "description": "Expert benchmark runner for MCP servers using mcpbr. Handles Docker checks, config generation, and result parsing.",
5
5
  "schema_version": "1.0"
6
6
  }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mcpbr
3
- Version: 0.5.2
3
+ Version: 0.5.4
4
4
  Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
5
5
  Project-URL: Homepage, https://github.com/greynewell/mcpbr
6
6
  Project-URL: Repository, https://github.com/greynewell/mcpbr
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@greynewell/mcpbr",
3
- "version": "0.5.2",
3
+ "version": "0.5.4",
4
4
  "description": "Model Context Protocol Benchmark Runner - CLI tool for evaluating MCP servers",
5
5
  "keywords": [
6
6
  "mcpbr",
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "mcpbr"
7
- version = "0.5.2"
7
+ version = "0.5.4"
8
8
  description = "Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -3,4 +3,4 @@
3
3
  A benchmark runner for evaluating MCP servers against SWE-bench tasks.
4
4
  """
5
5
 
6
- __version__ = "0.5.1"
6
+ __version__ = "0.5.4"
@@ -93,43 +93,52 @@ async def apply_patch(
93
93
 
94
94
  workdir = workdir or env.workdir
95
95
 
96
- # Reset repository to clean state before applying patch
97
- # The agent modified files directly, so we need to restore HEAD state
98
- await env.exec_command("git reset --hard HEAD", timeout=30, workdir=workdir)
99
- await env.exec_command("git clean -fd", timeout=30, workdir=workdir)
100
-
101
- await env.write_file("fix.patch", patch, workdir=workdir)
96
+ # Use longer timeouts for git operations under concurrent load,
97
+ # Docker exec can be slow and 30s is insufficient (#399).
98
+ try:
99
+ # Reset repository to clean state before applying patch
100
+ # The agent modified files directly, so we need to restore HEAD state
101
+ await env.exec_command("git reset --hard HEAD", timeout=120, workdir=workdir)
102
+ await env.exec_command("git clean -fd", timeout=120, workdir=workdir)
102
103
 
103
- exit_code, stdout, stderr = await env.exec_command(
104
- "git apply --check fix.patch",
105
- timeout=30,
106
- workdir=workdir,
107
- )
104
+ await env.write_file("fix.patch", patch, workdir=workdir)
108
105
 
109
- if exit_code != 0:
110
- exit_code2, stdout2, stderr2 = await env.exec_command(
111
- "git apply --check -3 fix.patch",
112
- timeout=30,
113
- workdir=workdir,
114
- )
115
- if exit_code2 != 0:
116
- return False, f"Patch does not apply: {stderr or stderr2}"
117
106
  exit_code, stdout, stderr = await env.exec_command(
118
- "git apply -3 fix.patch",
119
- timeout=30,
120
- workdir=workdir,
121
- )
122
- else:
123
- exit_code, stdout, stderr = await env.exec_command(
124
- "git apply fix.patch",
125
- timeout=30,
107
+ "git apply --check fix.patch",
108
+ timeout=120,
126
109
  workdir=workdir,
127
110
  )
128
111
 
129
- if exit_code != 0:
130
- return False, f"Failed to apply patch: {stderr}"
112
+ if exit_code != 0:
113
+ exit_code2, stdout2, stderr2 = await env.exec_command(
114
+ "git apply --check -3 fix.patch",
115
+ timeout=120,
116
+ workdir=workdir,
117
+ )
118
+ if exit_code2 != 0:
119
+ return False, f"Patch does not apply: {stderr or stderr2}"
120
+ exit_code, stdout, stderr = await env.exec_command(
121
+ "git apply -3 fix.patch",
122
+ timeout=120,
123
+ workdir=workdir,
124
+ )
125
+ else:
126
+ exit_code, stdout, stderr = await env.exec_command(
127
+ "git apply fix.patch",
128
+ timeout=120,
129
+ workdir=workdir,
130
+ )
131
131
 
132
- return True, ""
132
+ if exit_code != 0:
133
+ return False, f"Failed to apply patch: {stderr}"
134
+
135
+ return True, ""
136
+
137
+ except (TimeoutError, asyncio.TimeoutError):
138
+ # Catch exec_command timeouts here so they don't bubble up as
139
+ # asyncio.TimeoutError to the harness, which would misclassify
140
+ # this as an agent/eval timeout (#399).
141
+ return False, "Docker exec timed out during patch application"
133
142
 
134
143
 
135
144
  async def run_tests(
@@ -282,38 +291,43 @@ async def _apply_test_patch(
282
291
 
283
292
  workdir = workdir or env.workdir
284
293
 
285
- await env.write_file("test.patch", test_patch, workdir=workdir)
286
-
287
- exit_code, stdout, stderr = await env.exec_command(
288
- "git apply --check test.patch",
289
- timeout=30,
290
- workdir=workdir,
291
- )
294
+ try:
295
+ await env.write_file("test.patch", test_patch, workdir=workdir)
292
296
 
293
- if exit_code != 0:
294
297
  exit_code, stdout, stderr = await env.exec_command(
295
- "git apply --check -3 test.patch",
296
- timeout=30,
298
+ "git apply --check test.patch",
299
+ timeout=120,
297
300
  workdir=workdir,
298
301
  )
302
+
303
+ if exit_code != 0:
304
+ exit_code, stdout, stderr = await env.exec_command(
305
+ "git apply --check -3 test.patch",
306
+ timeout=120,
307
+ workdir=workdir,
308
+ )
309
+ if exit_code != 0:
310
+ return True, ""
311
+ exit_code, stdout, stderr = await env.exec_command(
312
+ "git apply -3 test.patch",
313
+ timeout=120,
314
+ workdir=workdir,
315
+ )
316
+ else:
317
+ exit_code, stdout, stderr = await env.exec_command(
318
+ "git apply test.patch",
319
+ timeout=120,
320
+ workdir=workdir,
321
+ )
322
+
299
323
  if exit_code != 0:
300
324
  return True, ""
301
- exit_code, stdout, stderr = await env.exec_command(
302
- "git apply -3 test.patch",
303
- timeout=30,
304
- workdir=workdir,
305
- )
306
- else:
307
- exit_code, stdout, stderr = await env.exec_command(
308
- "git apply test.patch",
309
- timeout=30,
310
- workdir=workdir,
311
- )
312
325
 
313
- if exit_code != 0:
314
326
  return True, ""
315
327
 
316
- return True, ""
328
+ except (TimeoutError, asyncio.TimeoutError):
329
+ # Don't let exec timeouts bubble up to the harness (#399)
330
+ return True, ""
317
331
 
318
332
 
319
333
  async def evaluate_patch(
@@ -356,7 +370,14 @@ async def evaluate_patch(
356
370
 
357
371
  # Skip dependency installation for pre-built images (already done)
358
372
  if not env.uses_prebuilt:
359
- await _install_dependencies(env)
373
+ try:
374
+ await _install_dependencies(env)
375
+ except (TimeoutError, asyncio.TimeoutError):
376
+ return EvaluationResult(
377
+ resolved=False,
378
+ patch_applied=True,
379
+ error="Docker exec timed out during dependency installation",
380
+ )
360
381
 
361
382
  repo = task.get("repo")
362
383
 
@@ -1,6 +1,7 @@
1
1
  """Main evaluation harness orchestrating parallel task execution."""
2
2
 
3
3
  import asyncio
4
+ import logging
4
5
  import time
5
6
  from dataclasses import dataclass
6
7
  from datetime import datetime, timezone
@@ -29,6 +30,7 @@ from .pricing import calculate_cost
29
30
  from .profiler import PerformanceProfiler
30
31
 
31
32
  console = Console()
33
+ logger = logging.getLogger(__name__)
32
34
 
33
35
 
34
36
  class SimpleNamespace:
@@ -56,6 +58,57 @@ def dict_to_namespace(data: Any) -> Any:
56
58
  return data
57
59
 
58
60
 
61
+ # -- Cold-start mitigation helpers (#401) ------------------------------------
62
+
63
+ # Seconds between each task launch in the first concurrent batch.
64
+ _STAGGER_INTERVAL = 1.0
65
+
66
+
67
+ def _stagger_delay(task_index: int, max_concurrent: int) -> float:
68
+ """Return the startup delay for a task to avoid cold-start contention.
69
+
70
+ Only the first batch (indices 0 .. max_concurrent-1) is staggered.
71
+ The very first task starts immediately; subsequent tasks in the batch
72
+ get an increasing delay so Docker image pulls and container creation
73
+ don't all hit at once.
74
+
75
+ Args:
76
+ task_index: Zero-based index of the task in launch order.
77
+ max_concurrent: Semaphore size / max parallelism.
78
+
79
+ Returns:
80
+ Delay in seconds (0.0 means start immediately).
81
+ """
82
+ if max_concurrent <= 1:
83
+ return 0.0
84
+ # Only stagger the first batch
85
+ if task_index >= max_concurrent:
86
+ return 0.0
87
+ return task_index * _STAGGER_INTERVAL
88
+
89
+
90
+ def _should_retry_zero_iteration(result: dict[str, Any]) -> bool:
91
+ """Check whether a task result indicates a cold-start failure worth retrying.
92
+
93
+ A cold-start failure is characterised by zero iterations AND zero tokens
94
+ AND a timeout status — the agent never actually ran.
95
+
96
+ Args:
97
+ result: Single-run result dict from _run_mcp_evaluation or _run_baseline_evaluation.
98
+
99
+ Returns:
100
+ True if the result looks like a cold-start failure.
101
+ """
102
+ if result.get("status") != "timeout":
103
+ return False
104
+ if result.get("iterations", -1) != 0:
105
+ return False
106
+ tokens = result.get("tokens", {})
107
+ if tokens.get("input", -1) != 0 or tokens.get("output", -1) != 0:
108
+ return False
109
+ return True
110
+
111
+
59
112
  @dataclass
60
113
  class TaskResult:
61
114
  """Result for a single task."""
@@ -302,6 +355,24 @@ async def run_single_task(
302
355
  mcp_server_config=config.mcp_server_a,
303
356
  server_name="server_a",
304
357
  )
358
+ # Retry once on cold-start failure (#401)
359
+ if result.mcp_server_a and _should_retry_zero_iteration(result.mcp_server_a):
360
+ logger.info(
361
+ "Retrying MCP server_a task %s (zero-iteration cold-start)", instance_id
362
+ )
363
+ result.mcp_server_a = await _run_mcp_evaluation(
364
+ task,
365
+ config,
366
+ docker_manager,
367
+ benchmark,
368
+ verbose,
369
+ verbosity,
370
+ mcp_log_writer_a if mcp_log_writer_a else log_file,
371
+ cache,
372
+ mcp_logs_dir,
373
+ mcp_server_config=config.mcp_server_a,
374
+ server_name="server_a",
375
+ )
305
376
  finally:
306
377
  if mcp_log_writer_a:
307
378
  mcp_log_writer_a.close()
@@ -324,6 +395,24 @@ async def run_single_task(
324
395
  mcp_server_config=config.mcp_server_b,
325
396
  server_name="server_b",
326
397
  )
398
+ # Retry once on cold-start failure (#401)
399
+ if result.mcp_server_b and _should_retry_zero_iteration(result.mcp_server_b):
400
+ logger.info(
401
+ "Retrying MCP server_b task %s (zero-iteration cold-start)", instance_id
402
+ )
403
+ result.mcp_server_b = await _run_mcp_evaluation(
404
+ task,
405
+ config,
406
+ docker_manager,
407
+ benchmark,
408
+ verbose,
409
+ verbosity,
410
+ mcp_log_writer_b if mcp_log_writer_b else log_file,
411
+ cache,
412
+ mcp_logs_dir,
413
+ mcp_server_config=config.mcp_server_b,
414
+ server_name="server_b",
415
+ )
327
416
  finally:
328
417
  if mcp_log_writer_b:
329
418
  mcp_log_writer_b.close()
@@ -344,6 +433,20 @@ async def run_single_task(
344
433
  cache,
345
434
  mcp_logs_dir,
346
435
  )
436
+ # Retry once on cold-start failure (#401)
437
+ if result.mcp and _should_retry_zero_iteration(result.mcp):
438
+ logger.info("Retrying MCP task %s (zero-iteration cold-start)", instance_id)
439
+ result.mcp = await _run_mcp_evaluation(
440
+ task,
441
+ config,
442
+ docker_manager,
443
+ benchmark,
444
+ verbose,
445
+ verbosity,
446
+ mcp_log_writer if mcp_log_writer else log_file,
447
+ cache,
448
+ mcp_logs_dir,
449
+ )
347
450
  finally:
348
451
  if mcp_log_writer:
349
452
  mcp_log_writer.close()
@@ -363,6 +466,19 @@ async def run_single_task(
363
466
  baseline_log_writer if baseline_log_writer else log_file,
364
467
  cache,
365
468
  )
469
+ # Retry once on cold-start failure (#401)
470
+ if result.baseline and _should_retry_zero_iteration(result.baseline):
471
+ logger.info("Retrying baseline task %s (zero-iteration cold-start)", instance_id)
472
+ result.baseline = await _run_baseline_evaluation(
473
+ task,
474
+ config,
475
+ docker_manager,
476
+ benchmark,
477
+ verbose,
478
+ verbosity,
479
+ baseline_log_writer if baseline_log_writer else log_file,
480
+ cache,
481
+ )
366
482
  finally:
367
483
  if baseline_log_writer:
368
484
  baseline_log_writer.close()
@@ -539,7 +655,15 @@ async def _run_mcp_evaluation(
539
655
  if env:
540
656
  # Track Docker teardown time
541
657
  teardown_start = time.time()
542
- await env.cleanup()
658
+ try:
659
+ await asyncio.wait_for(env.cleanup(), timeout=60)
660
+ except (asyncio.TimeoutError, Exception) as cleanup_err:
661
+ logger.warning("Container cleanup failed for MCP task: %s", cleanup_err)
662
+ try:
663
+ if hasattr(env, "container") and env.container:
664
+ env.container.remove(force=True)
665
+ except Exception:
666
+ pass
543
667
  if profiler:
544
668
  teardown_end = time.time()
545
669
  profiler.record_docker_teardown(teardown_end - teardown_start)
@@ -695,7 +819,15 @@ async def _run_baseline_evaluation(
695
819
  if env:
696
820
  # Track Docker teardown time
697
821
  teardown_start = time.time()
698
- await env.cleanup()
822
+ try:
823
+ await asyncio.wait_for(env.cleanup(), timeout=60)
824
+ except (asyncio.TimeoutError, Exception) as cleanup_err:
825
+ logger.warning("Container cleanup failed for baseline task: %s", cleanup_err)
826
+ try:
827
+ if hasattr(env, "container") and env.container:
828
+ env.container.remove(force=True)
829
+ except Exception:
830
+ pass
699
831
  if profiler:
700
832
  teardown_end = time.time()
701
833
  profiler.record_docker_teardown(teardown_end - teardown_start)
@@ -1013,9 +1145,10 @@ async def run_evaluation(
1013
1145
  semaphore = asyncio.Semaphore(config.max_concurrent)
1014
1146
  budget_exceeded = False
1015
1147
  current_cost = 0.0
1148
+ _task_launch_counter = 0
1016
1149
 
1017
1150
  async def run_with_semaphore(task: dict[str, Any]) -> TaskResult | None:
1018
- nonlocal current_cost, budget_exceeded
1151
+ nonlocal current_cost, budget_exceeded, _task_launch_counter
1019
1152
 
1020
1153
  # Check budget before running task
1021
1154
  if config.budget and current_cost >= config.budget:
@@ -1023,6 +1156,15 @@ async def run_evaluation(
1023
1156
  return None
1024
1157
 
1025
1158
  async with semaphore:
1159
+ # Stagger first-batch launches to avoid cold-start contention (#401).
1160
+ # Delay is inside the semaphore so the sleeping task holds its slot
1161
+ # and later tasks cannot leapfrog ahead of the first batch.
1162
+ my_index = _task_launch_counter
1163
+ _task_launch_counter += 1
1164
+ delay = _stagger_delay(my_index, config.max_concurrent)
1165
+ if delay > 0:
1166
+ await asyncio.sleep(delay)
1167
+
1026
1168
  result = await run_single_task(
1027
1169
  task,
1028
1170
  config,
@@ -17,6 +17,7 @@ except ImportError:
17
17
 
18
18
  from rich.console import Console
19
19
 
20
+ from .. import __version__
20
21
  from ..config import HarnessConfig
21
22
  from .base import InfrastructureProvider
22
23
 
@@ -342,9 +343,9 @@ class AzureProvider(InfrastructureProvider):
342
343
  else:
343
344
  console.print("[green]✓ Node.js installed[/green]")
344
345
 
345
- # Step 4: Install mcpbr
346
- console.print("[cyan]Installing mcpbr...[/cyan]")
347
- step4_cmd = f"python{py_ver} -m pip install mcpbr"
346
+ # Step 4: Install mcpbr (pin to local version)
347
+ console.print(f"[cyan]Installing mcpbr=={__version__}...[/cyan]")
348
+ step4_cmd = f"python{py_ver} -m pip install mcpbr=={__version__}"
348
349
  exit_code, _stdout, stderr = await self._ssh_exec(step4_cmd, timeout=300)
349
350
  if exit_code != 0:
350
351
  console.print(f"[yellow]⚠ mcpbr install issues: {stderr[:300]}[/yellow]")
@@ -0,0 +1,130 @@
1
+ """Tests for cold-start staggering and zero-iteration retry logic."""
2
+
3
+ import asyncio
4
+
5
+ import pytest
6
+
7
+ from mcpbr.harness import TaskResult, _should_retry_zero_iteration, _stagger_delay
8
+
9
+
10
+ class TestStaggeredStarts:
11
+ """Verify that concurrent task launches are staggered to avoid cold-start failures."""
12
+
13
+ @pytest.mark.asyncio
14
+ async def test_tasks_are_staggered(self) -> None:
15
+ """First-batch tasks should not all start at the same instant.
16
+
17
+ When max_concurrent > 1, the semaphore wrapper should insert a small
18
+ delay between task launches so Docker isn't overwhelmed by simultaneous
19
+ image pulls and container startups.
20
+ """
21
+ launch_times: list[float] = []
22
+ loop = asyncio.get_running_loop()
23
+
24
+ async def fake_run_single_task(task):
25
+ launch_times.append(loop.time())
26
+ await asyncio.sleep(0.05) # Simulate brief work
27
+ return TaskResult(instance_id=f"task-{len(launch_times)}")
28
+
29
+ tasks = [{"instance_id": f"task-{i}"} for i in range(5)]
30
+ max_concurrent = 5 # All 5 could start at once without staggering
31
+
32
+ semaphore = asyncio.Semaphore(max_concurrent)
33
+ task_counter = 0
34
+
35
+ async def run_with_semaphore(task):
36
+ nonlocal task_counter
37
+ async with semaphore:
38
+ my_index = task_counter
39
+ task_counter += 1
40
+ delay = _stagger_delay(my_index, max_concurrent)
41
+ if delay > 0:
42
+ await asyncio.sleep(delay)
43
+ return await fake_run_single_task(task)
44
+
45
+ async_tasks = [asyncio.create_task(run_with_semaphore(t)) for t in tasks]
46
+ await asyncio.gather(*async_tasks)
47
+
48
+ assert len(launch_times) == 5
49
+
50
+ # The first and last task should be separated by at least some delay
51
+ spread = launch_times[-1] - launch_times[0]
52
+ assert spread > 0.1, (
53
+ f"Tasks launched with only {spread:.3f}s spread — expected staggering to space them out"
54
+ )
55
+
56
+ @pytest.mark.asyncio
57
+ async def test_stagger_delay_values(self) -> None:
58
+ """_stagger_delay should return increasing delays for the first batch."""
59
+ # First task: no delay
60
+ assert _stagger_delay(0, max_concurrent=5) == 0.0
61
+
62
+ # Subsequent first-batch tasks: increasing delay
63
+ d1 = _stagger_delay(1, max_concurrent=5)
64
+ d2 = _stagger_delay(2, max_concurrent=5)
65
+ assert d1 > 0
66
+ assert d2 > d1
67
+
68
+ # Tasks beyond the first batch: no delay
69
+ assert _stagger_delay(5, max_concurrent=5) == 0.0
70
+ assert _stagger_delay(10, max_concurrent=5) == 0.0
71
+
72
+ @pytest.mark.asyncio
73
+ async def test_stagger_delay_single_concurrent(self) -> None:
74
+ """With max_concurrent=1, no staggering is needed."""
75
+ assert _stagger_delay(0, max_concurrent=1) == 0.0
76
+ assert _stagger_delay(1, max_concurrent=1) == 0.0
77
+
78
+
79
+ class TestZeroIterationRetry:
80
+ """Verify that _should_retry_zero_iteration detects cold-start failures."""
81
+
82
+ @pytest.mark.asyncio
83
+ async def test_detects_cold_start_failure(self) -> None:
84
+ """Zero iterations + zero tokens + timeout = cold-start failure."""
85
+ zero_iter_result = {
86
+ "resolved": False,
87
+ "patch_applied": False,
88
+ "status": "timeout",
89
+ "error": "Timeout",
90
+ "tokens": {"input": 0, "output": 0},
91
+ "iterations": 0,
92
+ "tool_calls": 0,
93
+ "cost": 0.0,
94
+ "runtime_seconds": 236.0,
95
+ }
96
+ assert _should_retry_zero_iteration(zero_iter_result) is True
97
+
98
+ @pytest.mark.asyncio
99
+ async def test_completed_task_not_retried(self) -> None:
100
+ """A task that completed successfully should never be retried."""
101
+ good_result = {
102
+ "resolved": True,
103
+ "status": "completed",
104
+ "iterations": 20,
105
+ "tokens": {"input": 10000, "output": 5000},
106
+ }
107
+ assert _should_retry_zero_iteration(good_result) is False
108
+
109
+ @pytest.mark.asyncio
110
+ async def test_nonzero_iteration_timeout_not_retried(self) -> None:
111
+ """A timeout with real iterations is a genuine timeout, not cold-start."""
112
+ real_timeout = {
113
+ "resolved": False,
114
+ "status": "timeout",
115
+ "iterations": 5,
116
+ "tokens": {"input": 3000, "output": 1500},
117
+ }
118
+ assert _should_retry_zero_iteration(real_timeout) is False
119
+
120
+ @pytest.mark.asyncio
121
+ async def test_non_timeout_error_not_retried(self) -> None:
122
+ """Zero iterations from a non-timeout error should not trigger retry."""
123
+ error_result = {
124
+ "resolved": False,
125
+ "status": "error",
126
+ "error": "Something broke",
127
+ "iterations": 0,
128
+ "tokens": {"input": 0, "output": 0},
129
+ }
130
+ assert _should_retry_zero_iteration(error_result) is False
@@ -61,7 +61,8 @@ class TestMCPLogging:
61
61
  1,
62
62
  "",
63
63
  "npx: command not found",
64
- ), # MCP registration fails
64
+ ), # .mcp.json write fails
65
+ (0, "", ""), # chown .mcp.json
65
66
  (0, "", ""), # cleanup temp files
66
67
  ]
67
68
 
@@ -87,7 +88,7 @@ class TestMCPLogging:
87
88
 
88
89
  # Verify registration failure was caught
89
90
  assert result.success is False
90
- assert "MCP server registration failed" in result.error
91
+ assert "MCP config write failed" in result.error
91
92
  assert "npx: command not found" in result.error
92
93
 
93
94
  # Verify cleanup was called
@@ -110,6 +111,7 @@ class TestMCPLogging:
110
111
  (0, "", ""), # env file write
111
112
  (0, "", ""), # chown env
112
113
  (1, "Server starting...\nInitialization failed", "Error: Missing API key"),
114
+ (0, "", ""), # chown .mcp.json
113
115
  (0, "", ""), # cleanup
114
116
  ]
115
117
 
@@ -133,10 +135,10 @@ class TestMCPLogging:
133
135
  task_id="test_id",
134
136
  )
135
137
 
136
- # Verify both stderr and stdout are in error message
138
+ # Verify stderr is in error message and stdout is captured separately
137
139
  assert "Error: Missing API key" in result.error
138
- assert "Server starting" in result.error or "Initialization failed" in result.error
139
140
  assert result.stdout is not None
141
+ assert "Server starting" in result.stdout or "Initialization failed" in result.stdout
140
142
 
141
143
  @pytest.mark.asyncio
142
144
  async def test_mcp_timeout_cleanup(self, harness: ClaudeCodeHarness) -> None:
@@ -177,8 +179,7 @@ class TestMCPLogging:
177
179
 
178
180
  # Verify timeout was caught
179
181
  assert result.success is False
180
- assert "timed out after 60s" in result.error
181
- assert "failed to start or is hanging" in result.error
182
+ assert "Failed to write MCP configuration file" in result.error
182
183
 
183
184
  # Verify cleanup was called
184
185
  cleanup_calls = [
@@ -255,9 +256,10 @@ Debug: Cache miss for /workspace/"""
255
256
  (0, "", ""), # chown prompt
256
257
  (0, "", ""), # env file write
257
258
  (0, "", ""), # chown env
258
- (0, "MCP server registered successfully", ""), # MCP registration
259
- (0, "", ""), # MCP server remove (cleanup)
260
- (0, "", ""), # rm temp files (cleanup)
259
+ (0, "MCP server registered successfully", ""), # .mcp.json write
260
+ (0, "", ""), # chown .mcp.json
261
+ (0, "", ""), # rm .mcp.json (exit_code != 0 path)
262
+ (0, "", ""), # rm temp files (finally cleanup)
261
263
  ]
262
264
 
263
265
  # Mock streaming execution with our test output
File without changes
File without changes