hud-python 0.4.23__tar.gz → 0.4.25__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (203)
  1. {hud_python-0.4.23 → hud_python-0.4.25}/PKG-INFO +1 -1
  2. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/base.py +48 -20
  3. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/claude.py +5 -1
  4. {hud_python-0.4.23 → hud_python-0.4.25}/hud/clients/mcp_use.py +9 -1
  5. hud_python-0.4.25/hud/clients/tests/test_mcp_use_retry.py +378 -0
  6. hud_python-0.4.25/hud/clients/utils/__init__.py +26 -0
  7. hud_python-0.4.25/hud/clients/utils/mcp_use_retry.py +201 -0
  8. hud_python-0.4.25/hud/clients/utils/retry.py +186 -0
  9. {hud_python-0.4.23 → hud_python-0.4.25}/hud/datasets/execution/parallel.py +25 -8
  10. {hud_python-0.4.23 → hud_python-0.4.25}/hud/otel/config.py +19 -2
  11. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/tests/test_version.py +1 -1
  12. {hud_python-0.4.23 → hud_python-0.4.25}/hud/version.py +1 -1
  13. {hud_python-0.4.23 → hud_python-0.4.25}/pyproject.toml +1 -1
  14. hud_python-0.4.23/hud/clients/utils/__init__.py +0 -1
  15. {hud_python-0.4.23 → hud_python-0.4.25}/.gitignore +0 -0
  16. {hud_python-0.4.23 → hud_python-0.4.25}/LICENSE +0 -0
  17. {hud_python-0.4.23 → hud_python-0.4.25}/README.md +0 -0
  18. {hud_python-0.4.23 → hud_python-0.4.25}/environments/README.md +0 -0
  19. {hud_python-0.4.23 → hud_python-0.4.25}/environments/browser/README.md +0 -0
  20. {hud_python-0.4.23 → hud_python-0.4.25}/environments/browser/apps/2048/README.md +0 -0
  21. {hud_python-0.4.23 → hud_python-0.4.25}/environments/browser/apps/2048/backend/pyproject.toml +0 -0
  22. {hud_python-0.4.23 → hud_python-0.4.25}/environments/browser/apps/README.md +0 -0
  23. {hud_python-0.4.23 → hud_python-0.4.25}/environments/browser/apps/todo/README.md +0 -0
  24. {hud_python-0.4.23 → hud_python-0.4.25}/environments/browser/apps/todo/backend/pyproject.toml +0 -0
  25. {hud_python-0.4.23 → hud_python-0.4.25}/environments/browser/pyproject.toml +0 -0
  26. {hud_python-0.4.23 → hud_python-0.4.25}/environments/remote_browser/README.md +0 -0
  27. {hud_python-0.4.23 → hud_python-0.4.25}/environments/remote_browser/pyproject.toml +0 -0
  28. {hud_python-0.4.23 → hud_python-0.4.25}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  29. {hud_python-0.4.23 → hud_python-0.4.25}/environments/text_2048/README.md +0 -0
  30. {hud_python-0.4.23 → hud_python-0.4.25}/environments/text_2048/pyproject.toml +0 -0
  31. {hud_python-0.4.23 → hud_python-0.4.25}/examples/README.md +0 -0
  32. {hud_python-0.4.23 → hud_python-0.4.25}/hud/__init__.py +0 -0
  33. {hud_python-0.4.23 → hud_python-0.4.25}/hud/__main__.py +0 -0
  34. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/__init__.py +0 -0
  35. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/grounded_openai.py +0 -0
  36. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/langchain.py +0 -0
  37. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/misc/__init__.py +0 -0
  38. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/misc/response_agent.py +0 -0
  39. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/openai.py +0 -0
  40. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/openai_chat_generic.py +0 -0
  41. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/tests/__init__.py +0 -0
  42. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/tests/test_base.py +0 -0
  43. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/tests/test_claude.py +0 -0
  44. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/tests/test_client.py +0 -0
  45. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  46. {hud_python-0.4.23 → hud_python-0.4.25}/hud/agents/tests/test_openai.py +0 -0
  47. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/__init__.py +0 -0
  48. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/__main__.py +0 -0
  49. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/analyze.py +0 -0
  50. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/build.py +0 -0
  51. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/clone.py +0 -0
  52. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/debug.py +0 -0
  53. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/dev.py +0 -0
  54. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/eval.py +0 -0
  55. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/hf.py +0 -0
  56. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/init.py +0 -0
  57. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/list_func.py +0 -0
  58. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/pull.py +0 -0
  59. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/push.py +0 -0
  60. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/remove.py +0 -0
  61. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/rl/README.md +0 -0
  62. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/rl/__init__.py +0 -0
  63. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/rl/init.py +0 -0
  64. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/rl/pod.py +0 -0
  65. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/rl/ssh.py +0 -0
  66. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/rl/train.py +0 -0
  67. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/rl/utils.py +0 -0
  68. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/__init__.py +0 -0
  69. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_analyze.py +0 -0
  70. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_analyze_metadata.py +0 -0
  71. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_build.py +0 -0
  72. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_cli_init.py +0 -0
  73. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_cli_main.py +0 -0
  74. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_clone.py +0 -0
  75. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_cursor.py +0 -0
  76. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_debug.py +0 -0
  77. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_list_func.py +0 -0
  78. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_main_module.py +0 -0
  79. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_mcp_server.py +0 -0
  80. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_pull.py +0 -0
  81. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_push.py +0 -0
  82. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_registry.py +0 -0
  83. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/tests/test_utils.py +0 -0
  84. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/utils/__init__.py +0 -0
  85. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/utils/cursor.py +0 -0
  86. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/utils/docker.py +0 -0
  87. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/utils/environment.py +0 -0
  88. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/utils/interactive.py +0 -0
  89. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/utils/logging.py +0 -0
  90. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/utils/metadata.py +0 -0
  91. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/utils/registry.py +0 -0
  92. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/utils/remote_runner.py +0 -0
  93. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/utils/runner.py +0 -0
  94. {hud_python-0.4.23 → hud_python-0.4.25}/hud/cli/utils/server.py +0 -0
  95. {hud_python-0.4.23 → hud_python-0.4.25}/hud/clients/README.md +0 -0
  96. {hud_python-0.4.23 → hud_python-0.4.25}/hud/clients/__init__.py +0 -0
  97. {hud_python-0.4.23 → hud_python-0.4.25}/hud/clients/base.py +0 -0
  98. {hud_python-0.4.23 → hud_python-0.4.25}/hud/clients/fastmcp.py +0 -0
  99. {hud_python-0.4.23 → hud_python-0.4.25}/hud/clients/tests/__init__.py +0 -0
  100. {hud_python-0.4.23 → hud_python-0.4.25}/hud/clients/tests/test_client_integration.py +0 -0
  101. {hud_python-0.4.23 → hud_python-0.4.25}/hud/clients/tests/test_fastmcp.py +0 -0
  102. {hud_python-0.4.23 → hud_python-0.4.25}/hud/clients/tests/test_protocol.py +0 -0
  103. {hud_python-0.4.23 → hud_python-0.4.25}/hud/clients/utils/retry_transport.py +0 -0
  104. {hud_python-0.4.23 → hud_python-0.4.25}/hud/datasets/__init__.py +0 -0
  105. {hud_python-0.4.23 → hud_python-0.4.25}/hud/datasets/execution/__init__.py +0 -0
  106. {hud_python-0.4.23 → hud_python-0.4.25}/hud/datasets/execution/runner.py +0 -0
  107. {hud_python-0.4.23 → hud_python-0.4.25}/hud/datasets/task.py +0 -0
  108. {hud_python-0.4.23 → hud_python-0.4.25}/hud/datasets/utils.py +0 -0
  109. {hud_python-0.4.23 → hud_python-0.4.25}/hud/misc/__init__.py +0 -0
  110. {hud_python-0.4.23 → hud_python-0.4.25}/hud/misc/claude_plays_pokemon.py +0 -0
  111. {hud_python-0.4.23 → hud_python-0.4.25}/hud/native/__init__.py +0 -0
  112. {hud_python-0.4.23 → hud_python-0.4.25}/hud/native/comparator.py +0 -0
  113. {hud_python-0.4.23 → hud_python-0.4.25}/hud/native/tests/__init__.py +0 -0
  114. {hud_python-0.4.23 → hud_python-0.4.25}/hud/native/tests/test_comparator.py +0 -0
  115. {hud_python-0.4.23 → hud_python-0.4.25}/hud/native/tests/test_native_init.py +0 -0
  116. {hud_python-0.4.23 → hud_python-0.4.25}/hud/otel/__init__.py +0 -0
  117. {hud_python-0.4.23 → hud_python-0.4.25}/hud/otel/collector.py +0 -0
  118. {hud_python-0.4.23 → hud_python-0.4.25}/hud/otel/context.py +0 -0
  119. {hud_python-0.4.23 → hud_python-0.4.25}/hud/otel/exporters.py +0 -0
  120. {hud_python-0.4.23 → hud_python-0.4.25}/hud/otel/instrumentation.py +0 -0
  121. {hud_python-0.4.23 → hud_python-0.4.25}/hud/otel/processors.py +0 -0
  122. {hud_python-0.4.23 → hud_python-0.4.25}/hud/otel/tests/__init__.py +0 -0
  123. {hud_python-0.4.23 → hud_python-0.4.25}/hud/otel/tests/test_processors.py +0 -0
  124. {hud_python-0.4.23 → hud_python-0.4.25}/hud/py.typed +0 -0
  125. {hud_python-0.4.23 → hud_python-0.4.25}/hud/server/__init__.py +0 -0
  126. {hud_python-0.4.23 → hud_python-0.4.25}/hud/server/context.py +0 -0
  127. {hud_python-0.4.23 → hud_python-0.4.25}/hud/server/helper/__init__.py +0 -0
  128. {hud_python-0.4.23 → hud_python-0.4.25}/hud/server/low_level.py +0 -0
  129. {hud_python-0.4.23 → hud_python-0.4.25}/hud/server/server.py +0 -0
  130. {hud_python-0.4.23 → hud_python-0.4.25}/hud/server/tests/__init__.py +0 -0
  131. {hud_python-0.4.23 → hud_python-0.4.25}/hud/settings.py +0 -0
  132. {hud_python-0.4.23 → hud_python-0.4.25}/hud/shared/__init__.py +0 -0
  133. {hud_python-0.4.23 → hud_python-0.4.25}/hud/shared/exceptions.py +0 -0
  134. {hud_python-0.4.23 → hud_python-0.4.25}/hud/shared/hints.py +0 -0
  135. {hud_python-0.4.23 → hud_python-0.4.25}/hud/shared/requests.py +0 -0
  136. {hud_python-0.4.23 → hud_python-0.4.25}/hud/shared/tests/__init__.py +0 -0
  137. {hud_python-0.4.23 → hud_python-0.4.25}/hud/shared/tests/test_exceptions.py +0 -0
  138. {hud_python-0.4.23 → hud_python-0.4.25}/hud/shared/tests/test_requests.py +0 -0
  139. {hud_python-0.4.23 → hud_python-0.4.25}/hud/telemetry/__init__.py +0 -0
  140. {hud_python-0.4.23 → hud_python-0.4.25}/hud/telemetry/instrument.py +0 -0
  141. {hud_python-0.4.23 → hud_python-0.4.25}/hud/telemetry/job.py +0 -0
  142. {hud_python-0.4.23 → hud_python-0.4.25}/hud/telemetry/replay.py +0 -0
  143. {hud_python-0.4.23 → hud_python-0.4.25}/hud/telemetry/tests/__init__.py +0 -0
  144. {hud_python-0.4.23 → hud_python-0.4.25}/hud/telemetry/tests/test_replay.py +0 -0
  145. {hud_python-0.4.23 → hud_python-0.4.25}/hud/telemetry/tests/test_trace.py +0 -0
  146. {hud_python-0.4.23 → hud_python-0.4.25}/hud/telemetry/trace.py +0 -0
  147. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/__init__.py +0 -0
  148. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/base.py +0 -0
  149. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/bash.py +0 -0
  150. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/computer/__init__.py +0 -0
  151. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/computer/anthropic.py +0 -0
  152. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/computer/hud.py +0 -0
  153. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/computer/openai.py +0 -0
  154. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/computer/settings.py +0 -0
  155. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/edit.py +0 -0
  156. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/executors/__init__.py +0 -0
  157. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/executors/base.py +0 -0
  158. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/executors/pyautogui.py +0 -0
  159. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/executors/tests/__init__.py +0 -0
  160. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/executors/tests/test_base_executor.py +0 -0
  161. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  162. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/executors/xdo.py +0 -0
  163. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/grounding/__init__.py +0 -0
  164. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/grounding/config.py +0 -0
  165. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/grounding/grounded_tool.py +0 -0
  166. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/grounding/grounder.py +0 -0
  167. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/grounding/tests/__init__.py +0 -0
  168. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  169. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/playwright.py +0 -0
  170. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/response.py +0 -0
  171. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/submit.py +0 -0
  172. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/tests/__init__.py +0 -0
  173. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/tests/test_base.py +0 -0
  174. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/tests/test_bash.py +0 -0
  175. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/tests/test_bash_extended.py +0 -0
  176. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/tests/test_computer.py +0 -0
  177. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/tests/test_computer_actions.py +0 -0
  178. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/tests/test_edit.py +0 -0
  179. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/tests/test_init.py +0 -0
  180. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/tests/test_playwright_tool.py +0 -0
  181. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/tests/test_response.py +0 -0
  182. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/tests/test_tools.py +0 -0
  183. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/tests/test_tools_init.py +0 -0
  184. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/tests/test_utils.py +0 -0
  185. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/types.py +0 -0
  186. {hud_python-0.4.23 → hud_python-0.4.25}/hud/tools/utils.py +0 -0
  187. {hud_python-0.4.23 → hud_python-0.4.25}/hud/types.py +0 -0
  188. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/__init__.py +0 -0
  189. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/agent_factories.py +0 -0
  190. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/async_utils.py +0 -0
  191. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/hud_console.py +0 -0
  192. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/mcp.py +0 -0
  193. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/pretty_errors.py +0 -0
  194. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/progress.py +0 -0
  195. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/telemetry.py +0 -0
  196. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/tests/__init__.py +0 -0
  197. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/tests/test_async_utils.py +0 -0
  198. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/tests/test_init.py +0 -0
  199. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/tests/test_mcp.py +0 -0
  200. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/tests/test_progress.py +0 -0
  201. {hud_python-0.4.23 → hud_python-0.4.25}/hud/utils/tests/test_telemetry.py +0 -0
  202. {hud_python-0.4.23 → hud_python-0.4.25}/rl/README.md +0 -0
  203. {hud_python-0.4.23 → hud_python-0.4.25}/rl/pyproject.toml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.23
3
+ Version: 0.4.25
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -207,6 +207,7 @@ class MCPAgent(ABC):
207
207
  else:
208
208
  raise TypeError(f"prompt_or_task must be str or Task, got {type(prompt_or_task)}")
209
209
  except Exception as e:
210
+ # Always return a Trace object for any exception
210
211
  if self._is_connection_error(e):
211
212
  # Return error trace for connection failures
212
213
  return Trace(
@@ -215,7 +216,15 @@ class MCPAgent(ABC):
215
216
  content=self._get_connection_error_message(e),
216
217
  isError=True,
217
218
  )
218
- raise
219
+ else:
220
+ # Return error trace for any other exception
221
+ return Trace(
222
+ reward=0.0,
223
+ done=True,
224
+ content=f"Task failed with error: {e}",
225
+ isError=True,
226
+ info={"error": str(e)},
227
+ )
219
228
  finally:
220
229
  # Cleanup auto-created resources
221
230
  await self._cleanup()
@@ -262,34 +271,53 @@ class MCPAgent(ABC):
262
271
  prompt_result = Trace(reward=0.0, done=True, content=str(e), isError=True)
263
272
  prompt_result.populate_from_context()
264
273
 
265
- # Always evaluate if we have a prompt result and evaluate tool
266
- if prompt_result is not None and task.evaluate_tool is not None:
274
+ # Always evaluate if we have evaluate tool, regardless of errors
275
+ if task.evaluate_tool is not None:
267
276
  try:
268
277
  self.console.progress_log(f"Evaluating tool phase: {task.evaluate_tool}")
269
278
  results = await self.call_tools(task.evaluate_tool)
270
279
 
271
280
  if any(result.isError for result in results):
272
- raise RuntimeError(f"{results}")
273
-
274
- # Extract reward and content from evaluation
275
- if results:
276
- reward = find_reward(results[0])
277
- eval_content = find_content(results[0])
278
-
279
- # Update the prompt result with evaluation reward
280
- prompt_result.reward = reward
281
-
282
- # Update the prompt result with evaluation content (if available)
283
- if eval_content:
284
- # Prompt result may already have final response content, so we append to it
285
- if prompt_result.content:
286
- prompt_result.content += "\n\n" + eval_content
281
+ self.console.warning_log(f"Evaluate tool returned error: {results}")
282
+ # Still extract what we can from the error response
283
+ if prompt_result is None:
284
+ prompt_result = Trace(
285
+ reward=0.0,
286
+ done=True,
287
+ content="Task failed before evaluation",
288
+ isError=True,
289
+ )
290
+ prompt_result.reward = 0.0 # Default to 0 on error
291
+ else:
292
+ # Extract reward and content from evaluation
293
+ if results:
294
+ reward = find_reward(results[0])
295
+ eval_content = find_content(results[0])
296
+
297
+ # Update the prompt result with evaluation reward
298
+ if prompt_result is None:
299
+ prompt_result = Trace(
300
+ reward=reward, done=True, content=eval_content or "", isError=False
301
+ )
287
302
  else:
288
- prompt_result.content = eval_content
303
+ prompt_result.reward = reward
304
+
305
+ # Update the prompt result with evaluation content (if available)
306
+ if eval_content:
307
+ # Prompt result may already have final response content,
308
+ # so we append to it
309
+ if prompt_result.content:
310
+ prompt_result.content += "\n\n" + eval_content
311
+ else:
312
+ prompt_result.content = eval_content
289
313
 
290
314
  except Exception as e:
291
315
  self.console.error_log(f"Evaluation phase failed: {e}")
292
- # Continue with the prompt result even if evaluation failed
316
+ # Ensure we have a result even if evaluation failed
317
+ if prompt_result is None:
318
+ prompt_result = Trace(
319
+ reward=0.0, done=True, content=f"Evaluation failed: {e}", isError=True
320
+ )
293
321
 
294
322
  return (
295
323
  prompt_result
@@ -196,7 +196,11 @@ class ClaudeAgent(MCPAgent):
196
196
  response = await self.anthropic_client.beta.messages.create(**create_kwargs)
197
197
  break
198
198
  except BadRequestError as e:
199
- if e.message.startswith("prompt is too long"):
199
+ if (
200
+ "prompt is too long" in str(e)
201
+ or "request_too_large" in str(e)
202
+ or e.status_code == 413
203
+ ):
200
204
  logger.warning("Prompt too long, truncating message history")
201
205
  # Keep first message and last 20 messages
202
206
  if len(current_messages) > 21:
@@ -15,6 +15,7 @@ from hud.types import MCPToolCall, MCPToolResult
15
15
  from hud.version import __version__ as hud_version
16
16
 
17
17
  from .base import BaseHUDClient
18
+ from .utils.mcp_use_retry import patch_all_sessions
18
19
 
19
20
  logger = logging.getLogger(__name__)
20
21
 
@@ -63,6 +64,10 @@ class MCPUseHUDClient(BaseHUDClient):
63
64
  self._sessions = await self._client.create_all_sessions()
64
65
  logger.info("Created %d MCP sessions", len(self._sessions))
65
66
 
67
+ # Patch all sessions with retry logic
68
+ patch_all_sessions(self._sessions)
69
+ logger.debug("Applied retry logic to all MCP sessions")
70
+
66
71
  # Configure validation for all sessions based on client setting
67
72
  try:
68
73
  for session in self._sessions.values():
@@ -127,7 +132,7 @@ class MCPUseHUDClient(BaseHUDClient):
127
132
  logger.warning("Client session not initialized for %s", server_name)
128
133
  continue
129
134
 
130
- # List tools
135
+ # List tools (retry logic is handled at transport level)
131
136
  tools_result = await session.connector.client_session.list_tools()
132
137
 
133
138
  logger.info(
@@ -202,6 +207,7 @@ class MCPUseHUDClient(BaseHUDClient):
202
207
  if session.connector.client_session is None:
203
208
  raise ValueError(f"Client session not initialized for {server_name}")
204
209
 
210
+ # Call tool (retry logic is handled at transport level)
205
211
  result = await session.connector.client_session.call_tool(
206
212
  name=original_tool.name, # Use original tool name, not prefixed
207
213
  arguments=tool_call.arguments or {},
@@ -232,6 +238,7 @@ class MCPUseHUDClient(BaseHUDClient):
232
238
  continue
233
239
  # Prefer standard method name if available
234
240
  if hasattr(session.connector.client_session, "list_resources"):
241
+ # List resources (retry logic is handled at transport level)
235
242
  resources = await session.connector.client_session.list_resources()
236
243
  else:
237
244
  # If the client doesn't support resource listing, skip
@@ -262,6 +269,7 @@ class MCPUseHUDClient(BaseHUDClient):
262
269
  resource_uri = AnyUrl(uri) if isinstance(uri, str) else uri
263
270
  # Prefer read_resource; fall back to list_resources if needed
264
271
  if hasattr(session.connector.client_session, "read_resource"):
272
+ # Read resource (retry logic is handled at transport level)
265
273
  result = await session.connector.client_session.read_resource(resource_uri)
266
274
  else:
267
275
  # Fallback path for older clients: not supported in strict typing
@@ -0,0 +1,378 @@
1
+ """Tests for MCP-use client retry functionality."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from unittest.mock import AsyncMock, Mock, patch
6
+
7
+ import pytest
8
+ import requests
9
+ from mcp import types
10
+
11
+ from hud.clients.mcp_use import MCPUseHUDClient
12
+ from hud.clients.utils.mcp_use_retry import (
13
+ create_async_retry_wrapper,
14
+ create_retry_session,
15
+ patch_all_sessions,
16
+ patch_mcp_session_http_client,
17
+ )
18
+ from hud.types import MCPToolCall
19
+
20
+
21
+ class TestRetrySession:
22
+ """Test the retry session creation."""
23
+
24
+ def test_create_retry_session(self):
25
+ """Test that retry session is configured correctly."""
26
+ session = create_retry_session(
27
+ max_retries=5,
28
+ retry_status_codes=(500, 502, 503, 504),
29
+ retry_delay=0.5,
30
+ backoff_factor=2.0,
31
+ )
32
+
33
+ # Check that session has adapters mounted
34
+ assert "http://" in session.adapters
35
+ assert "https://" in session.adapters
36
+
37
+ # Check adapter configuration
38
+ adapter = session.adapters["http://"]
39
+ assert adapter.max_retries.total == 5
40
+ assert 500 in adapter.max_retries.status_forcelist
41
+ assert 502 in adapter.max_retries.status_forcelist
42
+ assert adapter.max_retries.backoff_factor == 2.0
43
+
44
+ def test_retry_session_default_values(self):
45
+ """Test retry session with default values."""
46
+ session = create_retry_session()
47
+
48
+ adapter = session.adapters["https://"]
49
+ assert adapter.max_retries.total == 3
50
+ assert 502 in adapter.max_retries.status_forcelist
51
+ assert 503 in adapter.max_retries.status_forcelist
52
+ assert 504 in adapter.max_retries.status_forcelist
53
+
54
+
55
+ class TestAsyncRetryWrapper:
56
+ """Test the async retry wrapper functionality."""
57
+
58
+ @pytest.mark.asyncio
59
+ async def test_retry_on_error_status_codes(self):
60
+ """Test that async wrapper retries on specific status codes."""
61
+ call_count = 0
62
+
63
+ async def mock_func(*args, **kwargs):
64
+ nonlocal call_count
65
+ call_count += 1
66
+
67
+ # First two calls fail, third succeeds
68
+ if call_count < 3:
69
+ result = Mock()
70
+ result.status_code = 503 # Service unavailable
71
+ return result
72
+
73
+ result = Mock()
74
+ result.status_code = 200
75
+ return result
76
+
77
+ wrapped = create_async_retry_wrapper(
78
+ mock_func,
79
+ max_retries=3,
80
+ retry_status_codes=(503,),
81
+ retry_delay=0.01, # Short delay for testing
82
+ )
83
+
84
+ result = await wrapped()
85
+ assert call_count == 3
86
+ assert result.status_code == 200
87
+
88
+ @pytest.mark.asyncio
89
+ async def test_retry_on_exception(self):
90
+ """Test that async wrapper retries on exceptions with status codes."""
91
+ call_count = 0
92
+
93
+ async def mock_func(*args, **kwargs):
94
+ nonlocal call_count
95
+ call_count += 1
96
+
97
+ if call_count < 3:
98
+ raise Exception("HTTP 503 Service Unavailable")
99
+
100
+ return Mock(status_code=200)
101
+
102
+ wrapped = create_async_retry_wrapper(
103
+ mock_func,
104
+ max_retries=3,
105
+ retry_status_codes=(503,),
106
+ retry_delay=0.01,
107
+ )
108
+
109
+ result = await wrapped()
110
+ assert call_count == 3
111
+ assert result.status_code == 200
112
+
113
+ @pytest.mark.asyncio
114
+ async def test_no_retry_on_success(self):
115
+ """Test that successful calls don't trigger retries."""
116
+ call_count = 0
117
+
118
+ async def mock_func(*args, **kwargs):
119
+ nonlocal call_count
120
+ call_count += 1
121
+ return Mock(status_code=200)
122
+
123
+ wrapped = create_async_retry_wrapper(mock_func)
124
+
125
+ result = await wrapped()
126
+ assert call_count == 1
127
+ assert result.status_code == 200
128
+
129
+ @pytest.mark.asyncio
130
+ async def test_max_retries_exceeded(self):
131
+ """Test that retries stop after max attempts."""
132
+ call_count = 0
133
+
134
+ async def mock_func(*args, **kwargs):
135
+ nonlocal call_count
136
+ call_count += 1
137
+ raise Exception("HTTP 503 Service Unavailable")
138
+
139
+ wrapped = create_async_retry_wrapper(
140
+ mock_func,
141
+ max_retries=2,
142
+ retry_status_codes=(503,),
143
+ retry_delay=0.01,
144
+ )
145
+
146
+ with pytest.raises(Exception) as exc_info:
147
+ await wrapped()
148
+
149
+ assert "503" in str(exc_info.value)
150
+ assert call_count == 3 # Initial + 2 retries
151
+
152
+
153
+ class TestSessionPatching:
154
+ """Test the session patching functionality."""
155
+
156
+ def test_patch_sync_session(self):
157
+ """Test patching a synchronous session."""
158
+ # Create mock session with connector
159
+ mock_session = Mock()
160
+ mock_session.connector = Mock()
161
+ mock_session.connector._connection_manager = Mock()
162
+ mock_session.connector._connection_manager._session = requests.Session()
163
+
164
+ # Patch the session
165
+ patch_mcp_session_http_client(mock_session)
166
+
167
+ # Verify the session was replaced with retry-enabled one
168
+ patched_session = mock_session.connector._connection_manager._session
169
+ assert "http://" in patched_session.adapters
170
+ assert "https://" in patched_session.adapters
171
+
172
+ # Check that it has retry configuration
173
+ adapter = patched_session.adapters["http://"]
174
+ assert hasattr(adapter, "max_retries")
175
+
176
+ @pytest.mark.asyncio
177
+ async def test_patch_async_session(self):
178
+ """Test patching an async session."""
179
+ # Create mock async session
180
+ mock_session = Mock()
181
+ mock_session.connector = Mock()
182
+ mock_session.connector.client_session = Mock()
183
+
184
+ async def mock_send_request(*args, **kwargs):
185
+ return Mock(status_code=200)
186
+
187
+ mock_session.connector.client_session._send_request = mock_send_request
188
+
189
+ # Patch the session
190
+ patch_mcp_session_http_client(mock_session)
191
+
192
+ # Verify _send_request was wrapped
193
+ wrapped_func = mock_session.connector.client_session._send_request
194
+ assert wrapped_func != mock_send_request # Function was replaced
195
+
196
+ # Test that wrapped function still works
197
+ result = await wrapped_func()
198
+ assert result.status_code == 200
199
+
200
+ def test_patch_all_sessions(self):
201
+ """Test patching multiple sessions."""
202
+ # Create mock sessions
203
+ session1 = Mock()
204
+ session1.connector = Mock()
205
+ session1.connector._connection_manager = Mock()
206
+ session1.connector._connection_manager.session = requests.Session()
207
+
208
+ session2 = Mock()
209
+ session2.connector = Mock()
210
+ session2.connector.client_session = Mock()
211
+ session2.connector.client_session._send_request = AsyncMock()
212
+
213
+ sessions = {"server1": session1, "server2": session2}
214
+
215
+ # Patch all sessions
216
+ patch_all_sessions(sessions)
217
+
218
+ # Verify both were patched
219
+ assert "http://" in session1.connector._connection_manager.session.adapters
220
+ assert session2.connector.client_session._send_request != AsyncMock
221
+
222
+
223
class TestMCPUseClientRetry:
    """Run MCPUseHUDClient against mocked sessions and check retry behaviour.

    Every test replaces ``hud.clients.mcp_use.MCPUseClient`` with a mock whose
    ``create_all_sessions`` coroutine returns a single fake ``test_server``
    session, so no real MCP server is contacted.
    """

    @staticmethod
    def _fake_session(**client_session_attrs):
        """Return a mock session with the given attributes on its client session.

        Attributes are attached to ``session.connector.client_session`` in the
        order they are passed.
        """
        session = Mock()
        session.connector = Mock()
        session.connector.client_session = Mock()
        for attr_name, attr_value in client_session_attrs.items():
            setattr(session.connector.client_session, attr_name, attr_value)
        return session

    @staticmethod
    def _fake_backend(session):
        """Return a mock MCP-use client that yields ``session`` for ``test_server``."""
        backend = Mock()
        backend.create_all_sessions = AsyncMock(return_value={"test_server": session})
        return backend

    @pytest.mark.asyncio
    async def test_client_applies_retry_on_connect(self):
        """``initialize()`` registers exactly one session, keyed by server name."""
        client = MCPUseHUDClient({"test_server": {"url": "http://localhost:8080"}})

        # Fake session: stubbed transport and an empty tool list.
        session = self._fake_session(
            _send_request=AsyncMock(),
            list_tools=AsyncMock(return_value=Mock(tools=[])),
        )

        with patch("hud.clients.mcp_use.MCPUseClient") as client_cls:
            client_cls.from_dict.return_value = self._fake_backend(session)

            # initialize() is where the client is expected to install its retry
            # logic; only the resulting session registration is asserted here.
            await client.initialize()

            assert len(client._sessions) == 1
            assert "test_server" in client._sessions

    @pytest.mark.asyncio
    async def test_tool_call_with_retry(self):
        """A tool call failing once with a 503 message succeeds on the retry."""
        client = MCPUseHUDClient({"test_server": {"url": "http://localhost:8080"}})
        attempts = 0

        async def mock_call_tool(name, arguments):
            # Fails on the first attempt only; every later attempt succeeds.
            nonlocal attempts
            attempts += 1
            if attempts > 1:
                return Mock(
                    content=[types.TextContent(type="text", text="Success")],
                    isError=False,
                    structuredContent=None,
                )
            raise Exception("HTTP 503 Service Unavailable")

        tool = types.Tool(
            name="test_tool",
            description="Test tool",
            inputSchema={"type": "object"},
        )
        session = self._fake_session(
            list_tools=AsyncMock(return_value=Mock(tools=[tool])),
            call_tool=mock_call_tool,
            _send_request=AsyncMock(),
        )

        with patch("hud.clients.mcp_use.MCPUseClient") as client_cls:
            client_cls.from_dict.return_value = self._fake_backend(session)

            await client.initialize()

            # Retry is layered onto call_tool by the test itself, using whatever
            # callable the session exposes once initialization has finished.
            client_session = session.connector.client_session
            client_session.call_tool = create_async_retry_wrapper(
                client_session.call_tool,
                max_retries=2,
                retry_status_codes=(503,),
                retry_delay=0.01,
            )

            result = await client.call_tool(MCPToolCall(name="test_tool", arguments={}))

            # One failed attempt followed by one successful attempt.
            assert attempts == 2
            assert not result.isError
            assert result.content[0].text == "Success"

    @pytest.mark.asyncio
    async def test_resource_read_with_retry(self):
        """A resource read failing once with a 502 message succeeds on the retry."""
        client = MCPUseHUDClient({"test_server": {"url": "http://localhost:8080"}})
        attempts = 0

        async def mock_read_resource(uri):
            # Fails on the first attempt only; every later attempt succeeds.
            nonlocal attempts
            attempts += 1
            if attempts > 1:
                return Mock(contents=[Mock(text='{"status": "ok"}')])
            raise Exception("HTTP 502 Bad Gateway")

        session = self._fake_session(
            list_tools=AsyncMock(return_value=Mock(tools=[])),
            read_resource=mock_read_resource,
            _send_request=AsyncMock(),
        )

        with patch("hud.clients.mcp_use.MCPUseClient") as client_cls:
            client_cls.from_dict.return_value = self._fake_backend(session)

            await client.initialize()

            # Retry is layered onto read_resource by the test itself, using
            # whatever callable the session exposes after initialization.
            client_session = session.connector.client_session
            client_session.read_resource = create_async_retry_wrapper(
                client_session.read_resource,
                max_retries=2,
                retry_status_codes=(502,),
                retry_delay=0.01,
            )

            result = await client.read_resource("test://resource")

            # One failed attempt followed by one successful attempt.
            assert attempts == 2
            assert result is not None
            assert result.contents[0].text == '{"status": "ok"}'
375
+
376
+
377
if __name__ == "__main__":
    # Allow running this test module directly. pytest.main() returns the
    # session's exit code; raise it through SystemExit so that a failing run
    # produces a non-zero process status instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
@@ -0,0 +1,26 @@
1
+ """HUD MCP client utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .retry import (
6
+ DEFAULT_BACKOFF_FACTOR,
7
+ DEFAULT_MAX_RETRIES,
8
+ DEFAULT_RETRY_DELAY,
9
+ DEFAULT_RETRY_STATUS_CODES,
10
+ is_retryable_error,
11
+ retry_with_backoff,
12
+ with_retry,
13
+ )
14
+ from .retry_transport import RetryTransport, create_retry_httpx_client
15
+
16
# Public names of this package. Each entry is imported above from ``.retry``
# or ``.retry_transport``; the list stays a plain literal so that star-imports
# and static tooling can read it directly.
__all__ = [
    # DEFAULT_* names imported from .retry
    "DEFAULT_BACKOFF_FACTOR",
    "DEFAULT_MAX_RETRIES",
    "DEFAULT_RETRY_DELAY",
    "DEFAULT_RETRY_STATUS_CODES",
    # Imported from .retry_transport
    "RetryTransport",
    "create_retry_httpx_client",
    # Callables imported from .retry
    "is_retryable_error",
    "retry_with_backoff",
    "with_retry",
]