hud-python 0.4.63.tar.gz → 0.4.64.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (318)
  1. {hud_python-0.4.63 → hud_python-0.4.64}/PKG-INFO +3 -3
  2. {hud_python-0.4.63 → hud_python-0.4.64}/README.md +2 -2
  3. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/base.py +6 -4
  4. hud_python-0.4.64/hud/agents/claude.py +365 -0
  5. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/langchain.py +4 -1
  6. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/openai_chat_generic.py +4 -1
  7. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_base.py +0 -1
  8. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/eval.py +3 -3
  9. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_eval.py +93 -25
  10. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_version.py +1 -1
  11. {hud_python-0.4.63 → hud_python-0.4.64}/hud/version.py +1 -1
  12. {hud_python-0.4.63 → hud_python-0.4.64}/pyproject.toml +1 -1
  13. hud_python-0.4.63/hud/agents/claude.py +0 -419
  14. {hud_python-0.4.63 → hud_python-0.4.64}/.gitignore +0 -0
  15. {hud_python-0.4.63 → hud_python-0.4.64}/LICENSE +0 -0
  16. {hud_python-0.4.63 → hud_python-0.4.64}/environments/README.md +0 -0
  17. {hud_python-0.4.63 → hud_python-0.4.64}/environments/blank/README.md +0 -0
  18. {hud_python-0.4.63 → hud_python-0.4.64}/environments/blank/environment/README.md +0 -0
  19. {hud_python-0.4.63 → hud_python-0.4.64}/environments/blank/environment/pyproject.toml +0 -0
  20. {hud_python-0.4.63 → hud_python-0.4.64}/environments/blank/server/README.md +0 -0
  21. {hud_python-0.4.63 → hud_python-0.4.64}/environments/blank/server/pyproject.toml +0 -0
  22. {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/README.md +0 -0
  23. {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/browser-base/README.md +0 -0
  24. {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/environment/2048/README.md +0 -0
  25. {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
  26. {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/environment/README.md +0 -0
  27. {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/environment/pyproject.toml +0 -0
  28. {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/environment/todo/README.md +0 -0
  29. {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
  30. {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/pyproject.toml +0 -0
  31. {hud_python-0.4.63 → hud_python-0.4.64}/environments/browser/server/pyproject.toml +0 -0
  32. {hud_python-0.4.63 → hud_python-0.4.64}/environments/deepresearch/README.md +0 -0
  33. {hud_python-0.4.63 → hud_python-0.4.64}/environments/deepresearch/environment/pyproject.toml +0 -0
  34. {hud_python-0.4.63 → hud_python-0.4.64}/environments/deepresearch/pyproject.toml +0 -0
  35. {hud_python-0.4.63 → hud_python-0.4.64}/environments/deepresearch/server/pyproject.toml +0 -0
  36. {hud_python-0.4.63 → hud_python-0.4.64}/environments/jupyter/README.md +0 -0
  37. {hud_python-0.4.63 → hud_python-0.4.64}/environments/jupyter/server/pyproject.toml +0 -0
  38. {hud_python-0.4.63 → hud_python-0.4.64}/environments/online_mind2web/README.md +0 -0
  39. {hud_python-0.4.63 → hud_python-0.4.64}/environments/online_mind2web/pyproject.toml +0 -0
  40. {hud_python-0.4.63 → hud_python-0.4.64}/environments/online_mind2web/src/hud_controller/providers/README.md +0 -0
  41. {hud_python-0.4.63 → hud_python-0.4.64}/environments/remote_browser/README.md +0 -0
  42. {hud_python-0.4.63 → hud_python-0.4.64}/environments/remote_browser/pyproject.toml +0 -0
  43. {hud_python-0.4.63 → hud_python-0.4.64}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  44. {hud_python-0.4.63 → hud_python-0.4.64}/environments/rubrics/README.md +0 -0
  45. {hud_python-0.4.63 → hud_python-0.4.64}/environments/rubrics/environment/pyproject.toml +0 -0
  46. {hud_python-0.4.63 → hud_python-0.4.64}/environments/rubrics/pyproject.toml +0 -0
  47. {hud_python-0.4.63 → hud_python-0.4.64}/environments/rubrics/server/pyproject.toml +0 -0
  48. {hud_python-0.4.63 → hud_python-0.4.64}/environments/text_2048/README.md +0 -0
  49. {hud_python-0.4.63 → hud_python-0.4.64}/environments/text_2048/pyproject.toml +0 -0
  50. {hud_python-0.4.63 → hud_python-0.4.64}/examples/README.md +0 -0
  51. {hud_python-0.4.63 → hud_python-0.4.64}/hud/__init__.py +0 -0
  52. {hud_python-0.4.63 → hud_python-0.4.64}/hud/__main__.py +0 -0
  53. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/__init__.py +0 -0
  54. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/gemini.py +0 -0
  55. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/grounded_openai.py +0 -0
  56. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/lite_llm.py +0 -0
  57. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/misc/__init__.py +0 -0
  58. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/misc/integration_test_agent.py +0 -0
  59. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/misc/response_agent.py +0 -0
  60. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/openai.py +0 -0
  61. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/__init__.py +0 -0
  62. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_base_runtime.py +0 -0
  63. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_claude.py +0 -0
  64. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_client.py +0 -0
  65. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_gemini.py +0 -0
  66. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  67. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_openai.py +0 -0
  68. {hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/utils.py +0 -0
  69. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/__init__.py +0 -0
  70. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/__main__.py +0 -0
  71. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/analyze.py +0 -0
  72. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/build.py +0 -0
  73. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/clone.py +0 -0
  74. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/debug.py +0 -0
  75. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/dev.py +0 -0
  76. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/eval_config.py +0 -0
  77. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/flows/__init__.py +0 -0
  78. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/flows/dev.py +0 -0
  79. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/flows/tasks.py +0 -0
  80. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/get.py +0 -0
  81. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/init.py +0 -0
  82. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/list_func.py +0 -0
  83. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/pull.py +0 -0
  84. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/push.py +0 -0
  85. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/remove.py +0 -0
  86. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/__init__.py +0 -0
  87. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/celebrate.py +0 -0
  88. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/config.py +0 -0
  89. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/display.py +0 -0
  90. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/gpu.py +0 -0
  91. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/gpu_utils.py +0 -0
  92. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/local_runner.py +0 -0
  93. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/presets.py +0 -0
  94. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/remote_runner.py +0 -0
  95. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/rl_api.py +0 -0
  96. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/viewer.py +0 -0
  97. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/vllm.py +0 -0
  98. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/rl/wait_utils.py +0 -0
  99. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/__init__.py +0 -0
  100. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_analyze.py +0 -0
  101. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_analyze_metadata.py +0 -0
  102. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_analyze_module.py +0 -0
  103. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_build.py +0 -0
  104. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_build_failure.py +0 -0
  105. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_build_module.py +0 -0
  106. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_cli_init.py +0 -0
  107. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_cli_main.py +0 -0
  108. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
  109. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_cli_root.py +0 -0
  110. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_clone.py +0 -0
  111. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_convert.py +0 -0
  112. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_cursor.py +0 -0
  113. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_debug.py +0 -0
  114. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_list_func.py +0 -0
  115. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_main_module.py +0 -0
  116. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_mcp_server.py +0 -0
  117. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_pull.py +0 -0
  118. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_push.py +0 -0
  119. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_push_happy.py +0 -0
  120. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_push_wrapper.py +0 -0
  121. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_registry.py +0 -0
  122. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/tests/test_utils.py +0 -0
  123. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/__init__.py +0 -0
  124. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/config.py +0 -0
  125. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/cursor.py +0 -0
  126. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/docker.py +0 -0
  127. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/env_check.py +0 -0
  128. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/environment.py +0 -0
  129. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/interactive.py +0 -0
  130. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/local_runner.py +0 -0
  131. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/logging.py +0 -0
  132. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/metadata.py +0 -0
  133. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/package_runner.py +0 -0
  134. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/registry.py +0 -0
  135. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/remote_runner.py +0 -0
  136. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/runner.py +0 -0
  137. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/server.py +0 -0
  138. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/source_hash.py +0 -0
  139. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tasks.py +0 -0
  140. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/__init__.py +0 -0
  141. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_config.py +0 -0
  142. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_docker.py +0 -0
  143. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_docker_hints.py +0 -0
  144. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_env_check.py +0 -0
  145. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_environment.py +0 -0
  146. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_interactive_module.py +0 -0
  147. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_local_runner.py +0 -0
  148. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_logging_utils.py +0 -0
  149. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_metadata.py +0 -0
  150. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_package_runner.py +0 -0
  151. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_registry_utils.py +0 -0
  152. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_remote_runner.py +0 -0
  153. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_runner_modules.py +0 -0
  154. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_source_hash.py +0 -0
  155. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/tests/test_tasks.py +0 -0
  156. {hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/utils/version_check.py +0 -0
  157. {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/README.md +0 -0
  158. {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/__init__.py +0 -0
  159. {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/base.py +0 -0
  160. {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/fastmcp.py +0 -0
  161. {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/mcp_use.py +0 -0
  162. {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/tests/__init__.py +0 -0
  163. {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/tests/test_client_integration.py +0 -0
  164. {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/tests/test_fastmcp.py +0 -0
  165. {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/tests/test_mcp_use_retry.py +0 -0
  166. {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/tests/test_protocol.py +0 -0
  167. {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/utils/__init__.py +0 -0
  168. {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/utils/mcp_use_retry.py +0 -0
  169. {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/utils/retry.py +0 -0
  170. {hud_python-0.4.63 → hud_python-0.4.64}/hud/clients/utils/retry_transport.py +0 -0
  171. {hud_python-0.4.63 → hud_python-0.4.64}/hud/datasets/__init__.py +0 -0
  172. {hud_python-0.4.63 → hud_python-0.4.64}/hud/datasets/parallel.py +0 -0
  173. {hud_python-0.4.63 → hud_python-0.4.64}/hud/datasets/runner.py +0 -0
  174. {hud_python-0.4.63 → hud_python-0.4.64}/hud/datasets/tests/__init__.py +0 -0
  175. {hud_python-0.4.63 → hud_python-0.4.64}/hud/datasets/tests/test_runner.py +0 -0
  176. {hud_python-0.4.63 → hud_python-0.4.64}/hud/datasets/tests/test_utils.py +0 -0
  177. {hud_python-0.4.63 → hud_python-0.4.64}/hud/datasets/utils.py +0 -0
  178. {hud_python-0.4.63 → hud_python-0.4.64}/hud/misc/__init__.py +0 -0
  179. {hud_python-0.4.63 → hud_python-0.4.64}/hud/misc/claude_plays_pokemon.py +0 -0
  180. {hud_python-0.4.63 → hud_python-0.4.64}/hud/native/__init__.py +0 -0
  181. {hud_python-0.4.63 → hud_python-0.4.64}/hud/native/comparator.py +0 -0
  182. {hud_python-0.4.63 → hud_python-0.4.64}/hud/native/tests/__init__.py +0 -0
  183. {hud_python-0.4.63 → hud_python-0.4.64}/hud/native/tests/test_comparator.py +0 -0
  184. {hud_python-0.4.63 → hud_python-0.4.64}/hud/native/tests/test_native_init.py +0 -0
  185. {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/__init__.py +0 -0
  186. {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/collector.py +0 -0
  187. {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/config.py +0 -0
  188. {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/context.py +0 -0
  189. {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/exporters.py +0 -0
  190. {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/instrumentation.py +0 -0
  191. {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/processors.py +0 -0
  192. {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/tests/__init__.py +0 -0
  193. {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/tests/test_instrumentation.py +0 -0
  194. {hud_python-0.4.63 → hud_python-0.4.64}/hud/otel/tests/test_processors.py +0 -0
  195. {hud_python-0.4.63 → hud_python-0.4.64}/hud/py.typed +0 -0
  196. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/README.md +0 -0
  197. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/__init__.py +0 -0
  198. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/actor.py +0 -0
  199. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/buffer.py +0 -0
  200. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/chat_template.jinja +0 -0
  201. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/config.py +0 -0
  202. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/distributed.py +0 -0
  203. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/learner.py +0 -0
  204. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/tests/__init__.py +0 -0
  205. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/tests/test_learner.py +0 -0
  206. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/train.py +0 -0
  207. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/types.py +0 -0
  208. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/utils/start_vllm_server.sh +0 -0
  209. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/utils.py +0 -0
  210. {hud_python-0.4.63 → hud_python-0.4.64}/hud/rl/vllm_adapter.py +0 -0
  211. {hud_python-0.4.63 → hud_python-0.4.64}/hud/samples/__init__.py +0 -0
  212. {hud_python-0.4.63 → hud_python-0.4.64}/hud/samples/browser.py +0 -0
  213. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/__init__.py +0 -0
  214. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/context.py +0 -0
  215. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/helper/__init__.py +0 -0
  216. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/low_level.py +0 -0
  217. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/router.py +0 -0
  218. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/server.py +0 -0
  219. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/__init__.py +0 -0
  220. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_add_tool.py +0 -0
  221. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_context.py +0 -0
  222. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_mcp_server_handlers.py +0 -0
  223. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_mcp_server_integration.py +0 -0
  224. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_mcp_server_more.py +0 -0
  225. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_run_wrapper.py +0 -0
  226. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_server_extra.py +0 -0
  227. {hud_python-0.4.63 → hud_python-0.4.64}/hud/server/tests/test_sigterm_runner.py +0 -0
  228. {hud_python-0.4.63 → hud_python-0.4.64}/hud/settings.py +0 -0
  229. {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/__init__.py +0 -0
  230. {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/exceptions.py +0 -0
  231. {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/hints.py +0 -0
  232. {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/requests.py +0 -0
  233. {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/tests/__init__.py +0 -0
  234. {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/tests/test_exceptions.py +0 -0
  235. {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/tests/test_hints.py +0 -0
  236. {hud_python-0.4.63 → hud_python-0.4.64}/hud/shared/tests/test_requests.py +0 -0
  237. {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/__init__.py +0 -0
  238. {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/async_context.py +0 -0
  239. {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/instrument.py +0 -0
  240. {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/job.py +0 -0
  241. {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/replay.py +0 -0
  242. {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/tests/__init__.py +0 -0
  243. {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/tests/test_async_context.py +0 -0
  244. {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/tests/test_instrument.py +0 -0
  245. {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/tests/test_job.py +0 -0
  246. {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/tests/test_replay.py +0 -0
  247. {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/tests/test_trace.py +0 -0
  248. {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/trace.py +0 -0
  249. {hud_python-0.4.63 → hud_python-0.4.64}/hud/telemetry/utils.py +0 -0
  250. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/__init__.py +0 -0
  251. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/base.py +0 -0
  252. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/bash.py +0 -0
  253. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/computer/__init__.py +0 -0
  254. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/computer/anthropic.py +0 -0
  255. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/computer/gemini.py +0 -0
  256. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/computer/hud.py +0 -0
  257. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/computer/openai.py +0 -0
  258. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/computer/qwen.py +0 -0
  259. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/computer/settings.py +0 -0
  260. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/edit.py +0 -0
  261. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/executors/__init__.py +0 -0
  262. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/executors/base.py +0 -0
  263. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/executors/pyautogui.py +0 -0
  264. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/executors/tests/__init__.py +0 -0
  265. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/executors/tests/test_base_executor.py +0 -0
  266. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  267. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/executors/xdo.py +0 -0
  268. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/grounding/__init__.py +0 -0
  269. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/grounding/config.py +0 -0
  270. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/grounding/grounded_tool.py +0 -0
  271. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/grounding/grounder.py +0 -0
  272. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/grounding/tests/__init__.py +0 -0
  273. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  274. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/jupyter.py +0 -0
  275. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/playwright.py +0 -0
  276. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/response.py +0 -0
  277. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/submit.py +0 -0
  278. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/__init__.py +0 -0
  279. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_base.py +0 -0
  280. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_bash.py +0 -0
  281. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_bash_extended.py +0 -0
  282. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_computer.py +0 -0
  283. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_computer_actions.py +0 -0
  284. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_edit.py +0 -0
  285. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_init.py +0 -0
  286. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_jupyter_tool.py +0 -0
  287. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_playwright_tool.py +0 -0
  288. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_response.py +0 -0
  289. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_submit.py +0 -0
  290. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_tools.py +0 -0
  291. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_tools_init.py +0 -0
  292. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_types.py +0 -0
  293. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/tests/test_utils.py +0 -0
  294. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/types.py +0 -0
  295. {hud_python-0.4.63 → hud_python-0.4.64}/hud/tools/utils.py +0 -0
  296. {hud_python-0.4.63 → hud_python-0.4.64}/hud/types.py +0 -0
  297. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/__init__.py +0 -0
  298. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/agent_factories.py +0 -0
  299. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/async_utils.py +0 -0
  300. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/group_eval.py +0 -0
  301. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/hud_console.py +0 -0
  302. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/mcp.py +0 -0
  303. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/pretty_errors.py +0 -0
  304. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/progress.py +0 -0
  305. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/task_tracking.py +0 -0
  306. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tasks.py +0 -0
  307. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/telemetry.py +0 -0
  308. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/__init__.py +0 -0
  309. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_agent_factories.py +0 -0
  310. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_async_utils.py +0 -0
  311. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_init.py +0 -0
  312. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_mcp.py +0 -0
  313. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_pretty_errors.py +0 -0
  314. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_progress.py +0 -0
  315. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_tasks.py +0 -0
  316. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_telemetry.py +0 -0
  317. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tests/test_tool_shorthand.py +0 -0
  318. {hud_python-0.4.63 → hud_python-0.4.64}/hud/utils/tool_shorthand.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.63
3
+ Version: 0.4.64
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -227,7 +227,7 @@ async def main() -> None:
227
227
  client = MCPClient(mcp_config=task.mcp_config)
228
228
  agent = ClaudeAgent(
229
229
  mcp_client=client,
230
- model="claude-sonnet-4-20250514", # requires ANTHROPIC_API_KEY
230
+ model="claude-sonnet-4-5", # requires ANTHROPIC_API_KEY
231
231
  )
232
232
 
233
233
  result = await agent.run(task)
@@ -292,7 +292,7 @@ results = await run_dataset(
292
292
  name="My SheetBench-50 Evaluation",
293
293
  dataset="hud-evals/SheetBench-50", # <-- HuggingFace dataset
294
294
  agent_class=ClaudeAgent, # <-- Your custom agent can replace this (see https://docs.hud.ai/evaluate-agents/create-agents)
295
- agent_config={"model": "claude-sonnet-4-20250514"},
295
+ agent_config={"model": "claude-sonnet-4-5"},
296
296
  max_concurrent=50,
297
297
  max_steps=30,
298
298
  )
@@ -86,7 +86,7 @@ async def main() -> None:
86
86
  client = MCPClient(mcp_config=task.mcp_config)
87
87
  agent = ClaudeAgent(
88
88
  mcp_client=client,
89
- model="claude-sonnet-4-20250514", # requires ANTHROPIC_API_KEY
89
+ model="claude-sonnet-4-5", # requires ANTHROPIC_API_KEY
90
90
  )
91
91
 
92
92
  result = await agent.run(task)
@@ -151,7 +151,7 @@ results = await run_dataset(
151
151
  name="My SheetBench-50 Evaluation",
152
152
  dataset="hud-evals/SheetBench-50", # <-- HuggingFace dataset
153
153
  agent_class=ClaudeAgent, # <-- Your custom agent can replace this (see https://docs.hud.ai/evaluate-agents/create-agents)
154
- agent_config={"model": "claude-sonnet-4-20250514"},
154
+ agent_config={"model": "claude-sonnet-4-5"},
155
155
  max_concurrent=50,
156
156
  max_steps=30,
157
157
  )
@@ -25,8 +25,6 @@ if TYPE_CHECKING:
25
25
 
26
26
  logger = logging.getLogger(__name__)
27
27
 
28
- GLOBAL_SYSTEM_PROMPT = "You are an assistant that can use tools to help the user. You will be given a task and you will need to use the tools to complete the task." # noqa: E501
29
-
30
28
 
31
29
  class MCPAgent(ABC):
32
30
  """
@@ -58,7 +56,7 @@ class MCPAgent(ABC):
58
56
  disallowed_tools: list[str] | None = None,
59
57
  response_tool_name: str | None = None,
60
58
  # Messages
61
- system_prompt: str = GLOBAL_SYSTEM_PROMPT,
59
+ system_prompt: str | None = None,
62
60
  append_setup_output: bool = True,
63
61
  initial_screenshot: bool = True,
64
62
  # Misc
@@ -155,7 +153,10 @@ class MCPAgent(ABC):
155
153
  # If task is provided, apply agent_config and add lifecycle tools
156
154
  if isinstance(task, Task) and task.agent_config:
157
155
  if task.agent_config.get("system_prompt"):
158
- self.system_prompt += "\n\n" + task.agent_config["system_prompt"]
156
+ if self.system_prompt is None:
157
+ self.system_prompt = task.agent_config["system_prompt"]
158
+ else:
159
+ self.system_prompt += "\n\n" + task.agent_config["system_prompt"]
159
160
  if "append_setup_output" in task.agent_config:
160
161
  self.append_setup_output = task.agent_config["append_setup_output"]
161
162
  if "initial_screenshot" in task.agent_config:
@@ -242,6 +243,7 @@ class MCPAgent(ABC):
242
243
  return await self._run_context(context, max_steps=max_steps)
243
244
 
244
245
  except Exception as e:
246
+ logger.exception("Error while running agent:")
245
247
  # Always return a Trace object for any exception
246
248
  if self._is_connection_error(e):
247
249
  # Return error trace for connection failures
@@ -0,0 +1,365 @@
1
+ """Claude MCP Agent implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import copy
6
+ import logging
7
+ import re
8
+ from inspect import cleandoc
9
+ from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
10
+
11
+ from anthropic import Anthropic, AsyncAnthropic, Omit
12
+ from anthropic.types import (
13
+ CacheControlEphemeralParam,
14
+ )
15
+ from anthropic.types.beta import (
16
+ BetaBase64ImageSourceParam,
17
+ BetaContentBlockParam,
18
+ BetaImageBlockParam,
19
+ BetaMessageParam,
20
+ BetaTextBlockParam,
21
+ BetaToolBash20250124Param,
22
+ BetaToolComputerUse20250124Param,
23
+ BetaToolParam,
24
+ BetaToolResultBlockParam,
25
+ BetaToolTextEditor20250728Param,
26
+ BetaToolUnionParam,
27
+ )
28
+
29
+ import hud
30
+
31
+ if TYPE_CHECKING:
32
+ from hud.datasets import Task
33
+
34
+ import mcp.types as types
35
+
36
+ from hud.settings import settings
37
+ from hud.tools.computer.settings import computer_settings
38
+ from hud.types import AgentResponse, MCPToolCall, MCPToolResult
39
+ from hud.utils.hud_console import HUDConsole
40
+
41
+ from .base import MCPAgent
42
+
43
+ logger = logging.getLogger(__name__)
44
+
45
+
46
+ class ClaudeAgent(MCPAgent):
47
+ """
48
+ Claude agent that uses MCP servers for tool execution.
49
+
50
+ This agent uses Claude's native tool calling capabilities but executes
51
+ tools through MCP servers instead of direct implementation.
52
+ """
53
+
54
+ metadata: ClassVar[dict[str, Any]] = {
55
+ "display_width": computer_settings.ANTHROPIC_COMPUTER_WIDTH,
56
+ "display_height": computer_settings.ANTHROPIC_COMPUTER_HEIGHT,
57
+ }
58
+
59
+ def __init__(
60
+ self,
61
+ model_client: AsyncAnthropic | None = None,
62
+ model: str = "claude-sonnet-4-5",
63
+ max_tokens: int = 16384,
64
+ use_computer_beta: bool = True,
65
+ validate_api_key: bool = True,
66
+ computer_tool_regex: str = r"(^|_)(anthropic_computer|computer_anthropic|computer)$",
67
+ **kwargs: Any,
68
+ ) -> None:
69
+ """
70
+ Initialize Claude MCP agent.
71
+
72
+ Args:
73
+ model_client: AsyncAnthropic client (created if not provided)
74
+ model: Claude model to use
75
+ max_tokens: Maximum tokens for response
76
+ use_computer_beta: Whether to use computer-use beta features
77
+ computer_tool_regex: we use this regex to identify the computer tool
78
+ **kwargs: Additional arguments passed to BaseMCPAgent (including mcp_client)
79
+ """
80
+ super().__init__(**kwargs)
81
+
82
+ # Initialize client if not provided
83
+ if model_client is None:
84
+ api_key = settings.anthropic_api_key
85
+ if not api_key:
86
+ raise ValueError("Anthropic API key not found. Set ANTHROPIC_API_KEY.")
87
+ model_client = AsyncAnthropic(api_key=api_key)
88
+
89
+ # validate api key if requested
90
+ if validate_api_key:
91
+ try:
92
+ Anthropic(api_key=model_client.api_key).models.list()
93
+ except Exception as e:
94
+ raise ValueError(f"Anthropic API key is invalid: {e}") from e
95
+
96
+ self.anthropic_client = model_client
97
+ self.model = model
98
+ self.max_tokens = max_tokens
99
+ self.use_computer_beta = use_computer_beta
100
+ self.hud_console = HUDConsole(logger=logger)
101
+
102
+ self.model_name = "Claude"
103
+ self.checkpoint_name = self.model
104
+
105
+ self.computer_tool_regex = computer_tool_regex
106
+
107
+ # these will be initialized in _convert_tools_for_claude
108
+ self.has_computer_tool = False
109
+ self.tool_mapping: dict[str, str] = {}
110
+ self.claude_tools: list[BetaToolUnionParam] = []
111
+
112
+ async def initialize(self, task: str | Task | None = None) -> None:
113
+ """Initialize the agent and build tool mappings."""
114
+ await super().initialize(task)
115
+ # Build tool mappings after tools are discovered
116
+ self._convert_tools_for_claude()
117
+
118
+ async def get_system_messages(self) -> list[Any]:
119
+ """No system messages for Claude because applied in get_response"""
120
+ return []
121
+
122
+ async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
123
+ """Format messages for Claude."""
124
+ # Convert MCP content types to Anthropic content types
125
+ anthropic_blocks: list[BetaContentBlockParam] = []
126
+
127
+ for block in blocks:
128
+ if isinstance(block, types.TextContent):
129
+ # Only include fields that Anthropic expects
130
+ anthropic_blocks.append(
131
+ BetaTextBlockParam(
132
+ type="text",
133
+ text=block.text,
134
+ )
135
+ )
136
+ elif isinstance(block, types.ImageContent):
137
+ # Convert MCP ImageContent to Anthropic format
138
+ anthropic_blocks.append(
139
+ BetaImageBlockParam(
140
+ type="image",
141
+ source=BetaBase64ImageSourceParam(
142
+ type="base64",
143
+ media_type=cast(
144
+ "Literal['image/jpeg', 'image/png', 'image/gif', 'image/webp']",
145
+ block.mimeType,
146
+ ),
147
+ data=block.data,
148
+ ),
149
+ )
150
+ )
151
+ else:
152
+ raise ValueError(f"Unknown content block type: {type(block)}")
153
+
154
+ return [BetaMessageParam(role="user", content=anthropic_blocks)]
155
+
156
+ @hud.instrument(
157
+ span_type="agent",
158
+ record_args=False, # Messages can be large
159
+ record_result=True,
160
+ )
161
+ async def get_response(self, messages: list[BetaMessageParam]) -> AgentResponse:
162
+ """Get response from Claude including any tool calls."""
163
+
164
+ messages_cached = self._add_prompt_caching(messages)
165
+
166
+ response = await self.anthropic_client.beta.messages.create(
167
+ model=self.model,
168
+ system=self.system_prompt if self.system_prompt is not None else Omit(),
169
+ max_tokens=self.max_tokens,
170
+ messages=messages_cached,
171
+ tools=self.claude_tools,
172
+ tool_choice={"type": "auto", "disable_parallel_tool_use": True},
173
+ betas=["computer-use-2025-01-24"] if self.has_computer_tool else [],
174
+ )
175
+
176
+ messages.append(
177
+ BetaMessageParam(
178
+ role="assistant",
179
+ content=response.content,
180
+ )
181
+ )
182
+
183
+ # Process response
184
+ result = AgentResponse(content="", tool_calls=[], done=True)
185
+
186
+ # Extract text content and reasoning
187
+ text_content = ""
188
+ thinking_content = ""
189
+
190
+ for block in response.content:
191
+ if block.type == "tool_use":
192
+ tool_call = MCPToolCall(
193
+ id=block.id,
194
+ # look up name in tool_mapping if available, otherwise use block name
195
+ name=self.tool_mapping.get(block.name, block.name),
196
+ arguments=block.input,
197
+ )
198
+ result.tool_calls.append(tool_call)
199
+ result.done = False
200
+ elif block.type == "text":
201
+ text_content += block.text
202
+ elif hasattr(block, "type") and block.type == "thinking":
203
+ thinking_content += f"Thinking: {block.thinking}\n"
204
+
205
+ result.content = thinking_content + text_content
206
+
207
+ return result
208
+
209
+ async def format_tool_results(
210
+ self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
211
+ ) -> list[BetaMessageParam]:
212
+ """Format tool results into Claude messages."""
213
+ # Process each tool result
214
+ user_content = []
215
+
216
+ for tool_call, result in zip(tool_calls, tool_results, strict=True):
217
+ # Extract Claude-specific metadata from extra fields
218
+ tool_use_id = tool_call.id
219
+ if not tool_use_id:
220
+ self.hud_console.warning(f"No tool_use_id found for {tool_call.name}")
221
+ continue
222
+
223
+ # Convert MCP tool results to Claude format
224
+ claude_blocks = []
225
+
226
+ if result.isError:
227
+ # Extract error message from content
228
+ error_msg = "Tool execution failed"
229
+ for content in result.content:
230
+ if isinstance(content, types.TextContent):
231
+ error_msg = content.text
232
+ break
233
+ claude_blocks.append(text_to_content_block(f"Error: {error_msg}"))
234
+ else:
235
+ # Process success content
236
+ for content in result.content:
237
+ if isinstance(content, types.TextContent):
238
+ claude_blocks.append(text_to_content_block(content.text))
239
+ elif isinstance(content, types.ImageContent):
240
+ claude_blocks.append(base64_to_content_block(content.data))
241
+
242
+ # Add tool result
243
+ user_content.append(tool_use_content_block(tool_use_id, claude_blocks))
244
+
245
+ # Return as a user message containing all tool results
246
+ return [
247
+ BetaMessageParam(
248
+ role="user",
249
+ content=user_content,
250
+ )
251
+ ]
252
+
253
+ async def create_user_message(self, text: str) -> BetaMessageParam:
254
+ """Create a user message in Claude's format."""
255
+ return BetaMessageParam(role="user", content=text)
256
+
257
+ def _convert_tools_for_claude(self) -> None:
258
+ """Convert MCP tools to Claude API tools."""
259
+
260
+ def to_api_tool(tool: types.Tool) -> BetaToolUnionParam:
261
+ if tool.name == "str_replace_based_edit_tool":
262
+ return BetaToolTextEditor20250728Param(
263
+ type="text_editor_20250728",
264
+ name="str_replace_based_edit_tool",
265
+ cache_control=CacheControlEphemeralParam(type="ephemeral"),
266
+ )
267
+ if tool.name == "bash":
268
+ return BetaToolBash20250124Param(
269
+ type="bash_20250124",
270
+ name="bash",
271
+ cache_control=CacheControlEphemeralParam(type="ephemeral"),
272
+ )
273
+ if re.fullmatch(self.computer_tool_regex, tool.name):
274
+ return BetaToolComputerUse20250124Param(
275
+ type="computer_20250124",
276
+ name="computer",
277
+ display_number=1,
278
+ display_width_px=computer_settings.ANTHROPIC_COMPUTER_WIDTH,
279
+ display_height_px=computer_settings.ANTHROPIC_COMPUTER_HEIGHT,
280
+ cache_control=CacheControlEphemeralParam(type="ephemeral"),
281
+ )
282
+
283
+ if tool.description is None or tool.inputSchema is None:
284
+ raise ValueError(
285
+ cleandoc(f"""MCP tool {tool.name} requires both a description and inputSchema.
286
+ Add these by:
287
+ 1. Adding a docstring to your @mcp.tool decorated function for the description
288
+ 2. Using pydantic Field() annotations on function parameters for the schema
289
+ """)
290
+ )
291
+ """Convert a tool to the API format"""
292
+ return BetaToolParam(
293
+ name=tool.name,
294
+ description=tool.description,
295
+ input_schema=tool.inputSchema,
296
+ cache_control=CacheControlEphemeralParam(type="ephemeral"),
297
+ )
298
+
299
+ self.has_computer_tool = False
300
+ self.tool_mapping = {}
301
+ self.claude_tools = []
302
+ for tool in self.get_available_tools():
303
+ claude_tool = to_api_tool(tool)
304
+ # warn if multiple computer tools are found
305
+ if claude_tool["name"] == "computer":
306
+ if self.has_computer_tool:
307
+ logger.warning(
308
+ "Multiple computer tools found. Ignoring %s since %s is already present",
309
+ tool.name,
310
+ self.tool_mapping["computer"],
311
+ )
312
+ continue
313
+ else:
314
+ self.has_computer_tool = True
315
+ self.tool_mapping[claude_tool["name"]] = tool.name
316
+ self.claude_tools.append(claude_tool)
317
+
318
+ def _add_prompt_caching(self, messages: list[BetaMessageParam]) -> list[BetaMessageParam]:
319
+ """Add prompt caching to messages."""
320
+ messages_cached = copy.deepcopy(messages)
321
+ cache_control: CacheControlEphemeralParam = {"type": "ephemeral"}
322
+
323
+ # Mark last user message with cache control
324
+ if (
325
+ messages_cached
326
+ and isinstance(messages_cached[-1], dict)
327
+ and messages_cached[-1].get("role") == "user"
328
+ ):
329
+ last_content = messages_cached[-1]["content"]
330
+ # Content is formatted to be list of ContentBlock in format_blocks and format_message
331
+ if isinstance(last_content, list):
332
+ for block in last_content:
333
+ # Only add cache control to dict-like block types that support it
334
+ if isinstance(block, dict):
335
+ match block["type"]:
336
+ case "redacted_thinking" | "thinking":
337
+ pass
338
+ case _:
339
+ block["cache_control"] = cache_control
340
+
341
+ return messages_cached
342
+
343
+
344
def base64_to_content_block(
    base64: str,
    media_type: Literal["image/jpeg", "image/png", "image/gif", "image/webp"] = "image/png",
) -> BetaImageBlockParam:
    """Convert base64 image to Claude content block.

    Args:
        base64: Base64-encoded image payload.
        media_type: Media type to declare for the payload. Defaults to
            ``"image/png"``, which is what this helper always emitted before
            the parameter existed.

    Returns:
        An image content block with a base64 source.
    """
    # Plain dict literals, like the sibling helpers in this module.
    return {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": media_type,
            "data": base64,
        },
    }
354
+
355
+
356
def text_to_content_block(text: str) -> BetaTextBlockParam:
    """Wrap a plain string in a Claude ``text`` content block."""
    text_block: BetaTextBlockParam = {"type": "text", "text": text}
    return text_block
359
+
360
+
361
def tool_use_content_block(
    tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]
) -> BetaToolResultBlockParam:
    """Build the ``tool_result`` block that answers the given tool-use id.

    The ``content`` list is stored as-is (not copied) in the returned block.
    """
    result_block: BetaToolResultBlockParam = {
        "type": "tool_result",
        "tool_use_id": tool_use_id,
        "content": content,
    }
    return result_block
@@ -89,7 +89,10 @@ class LangChainAgent(MCPAgent):
89
89
 
90
90
  async def get_system_messages(self) -> list[BaseMessage]:
91
91
  """Get system messages for LangChain."""
92
- return [SystemMessage(content=self.system_prompt)]
92
+ if self.system_prompt is not None:
93
+ return [SystemMessage(content=self.system_prompt)]
94
+ else:
95
+ return []
93
96
 
94
97
  async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[BaseMessage]:
95
98
  """Create initial messages for LangChain."""
@@ -84,7 +84,10 @@ class GenericOpenAIChatAgent(MCPAgent):
84
84
 
85
85
  async def get_system_messages(self) -> list[Any]:
86
86
  """Get system messages for OpenAI."""
87
- return [{"role": "system", "content": self.system_prompt}]
87
+ if self.system_prompt is not None:
88
+ return [{"role": "system", "content": self.system_prompt}]
89
+ else:
90
+ return []
88
91
 
89
92
  async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
90
93
  """Format blocks for OpenAI."""
@@ -96,7 +96,6 @@ class TestBaseMCPAgent:
96
96
  assert agent.allowed_tools is None
97
97
  assert agent.disallowed_tools is None
98
98
  assert agent.initial_screenshot is True
99
- assert agent.system_prompt is not None # Default system prompt is set
100
99
 
101
100
  def test_init_with_params(self):
102
101
  """Test initialization with custom parameters."""
@@ -232,7 +232,7 @@ def build_agent(
232
232
  )
233
233
  raise typer.Exit(1) from e
234
234
 
235
- model = model or "claude-sonnet-4-20250514"
235
+ model = model or "claude-sonnet-4-5"
236
236
 
237
237
  if allowed_tools:
238
238
  return ClaudeAgent(
@@ -393,7 +393,7 @@ async def run_single_task(
393
393
 
394
394
  agent_class = ClaudeAgent
395
395
  agent_config = {
396
- "model": model or "claude-sonnet-4-20250514",
396
+ "model": model or "claude-sonnet-4-5",
397
397
  "verbose": verbose,
398
398
  "validate_api_key": False,
399
399
  }
@@ -626,7 +626,7 @@ async def run_full_dataset(
626
626
  raise typer.Exit(1) from e
627
627
 
628
628
  agent_config = {
629
- "model": model or "claude-sonnet-4-20250514",
629
+ "model": model or "claude-sonnet-4-5",
630
630
  "verbose": verbose,
631
631
  "validate_api_key": False,
632
632
  }