hud-python 0.5.25__tar.gz → 0.5.27__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (347)
  1. {hud_python-0.5.25 → hud_python-0.5.27}/PKG-INFO +1 -1
  2. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/claude.py +96 -21
  3. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_claude.py +138 -0
  4. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/dev.py +11 -12
  5. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/eval.py +16 -26
  6. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/dev.py +3 -2
  7. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/remove.py +3 -2
  8. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_build.py +2 -2
  9. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/interactive.py +5 -3
  10. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/version_check.py +6 -9
  11. {hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/loader.py +15 -10
  12. {hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/runner.py +1 -1
  13. {hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/tests/test_loader.py +62 -6
  14. {hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/utils.py +17 -4
  15. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/__init__.py +2 -1
  16. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/scenarios.py +128 -45
  17. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/test_scenarios.py +78 -9
  18. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/test_tools.py +56 -0
  19. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/context.py +14 -4
  20. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/display.py +12 -7
  21. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/manager.py +6 -50
  22. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/task.py +59 -5
  23. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/tests/test_context.py +48 -0
  24. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/tests/test_eval.py +35 -0
  25. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/tests/test_task.py +56 -0
  26. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/types.py +1 -1
  27. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/anthropic.py +2 -2
  28. {hud_python-0.5.25 → hud_python-0.5.27}/hud/types.py +11 -2
  29. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tests/test_version.py +1 -1
  30. {hud_python-0.5.25 → hud_python-0.5.27}/hud/version.py +1 -1
  31. {hud_python-0.5.25 → hud_python-0.5.27}/pyproject.toml +1 -1
  32. {hud_python-0.5.25 → hud_python-0.5.27}/.gitignore +0 -0
  33. {hud_python-0.5.25 → hud_python-0.5.27}/LICENSE +0 -0
  34. {hud_python-0.5.25 → hud_python-0.5.27}/README.md +0 -0
  35. {hud_python-0.5.25 → hud_python-0.5.27}/examples/README.md +0 -0
  36. {hud_python-0.5.25 → hud_python-0.5.27}/hud/__init__.py +0 -0
  37. {hud_python-0.5.25 → hud_python-0.5.27}/hud/__main__.py +0 -0
  38. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/__init__.py +0 -0
  39. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/base.py +0 -0
  40. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/gateway.py +0 -0
  41. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/gemini.py +0 -0
  42. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/gemini_cua.py +0 -0
  43. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/grounded_openai.py +0 -0
  44. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/misc/__init__.py +0 -0
  45. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/misc/integration_test_agent.py +0 -0
  46. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/misc/response_agent.py +0 -0
  47. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/openai.py +0 -0
  48. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/openai_chat.py +0 -0
  49. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/operator.py +0 -0
  50. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/resolver.py +0 -0
  51. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/__init__.py +0 -0
  52. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/conftest.py +0 -0
  53. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_base.py +0 -0
  54. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_base_runtime.py +0 -0
  55. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_gemini.py +0 -0
  56. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  57. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_integration_test_agent.py +0 -0
  58. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_openai.py +0 -0
  59. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_operator.py +0 -0
  60. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_resolver.py +0 -0
  61. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_run_eval.py +0 -0
  62. {hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/types.py +0 -0
  63. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/__init__.py +0 -0
  64. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/__main__.py +0 -0
  65. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/analyze.py +0 -0
  66. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/build.py +0 -0
  67. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/clone.py +0 -0
  68. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/convert/__init__.py +0 -0
  69. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/convert/base.py +0 -0
  70. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/convert/harbor.py +0 -0
  71. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/convert/tests/__init__.py +0 -0
  72. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/convert/tests/conftest.py +0 -0
  73. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/convert/tests/test_harbor.py +0 -0
  74. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/debug.py +0 -0
  75. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/deploy.py +0 -0
  76. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/__init__.py +0 -0
  77. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/init.py +0 -0
  78. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/tasks.py +0 -0
  79. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/templates.py +0 -0
  80. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/tests/__init__.py +0 -0
  81. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/tests/test_dev.py +0 -0
  82. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/get.py +0 -0
  83. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/init.py +0 -0
  84. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/link.py +0 -0
  85. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/list_func.py +0 -0
  86. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/pull.py +0 -0
  87. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/push.py +0 -0
  88. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/rft.py +0 -0
  89. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/rft_status.py +0 -0
  90. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/__init__.py +0 -0
  91. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_analyze.py +0 -0
  92. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_analyze_metadata.py +0 -0
  93. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_analyze_module.py +0 -0
  94. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_build_failure.py +0 -0
  95. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_build_module.py +0 -0
  96. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_cli_init.py +0 -0
  97. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_cli_main.py +0 -0
  98. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
  99. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_cli_root.py +0 -0
  100. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_clone.py +0 -0
  101. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_convert.py +0 -0
  102. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_cursor.py +0 -0
  103. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_debug.py +0 -0
  104. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_debug_directory_mode.py +0 -0
  105. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_deploy.py +0 -0
  106. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_dev.py +0 -0
  107. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_eval.py +0 -0
  108. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_eval_bedrock.py +0 -0
  109. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_init.py +0 -0
  110. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_list_func.py +0 -0
  111. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_main_module.py +0 -0
  112. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_mcp_server.py +0 -0
  113. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_pull.py +0 -0
  114. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_push.py +0 -0
  115. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_push_happy.py +0 -0
  116. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_push_wrapper.py +0 -0
  117. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_registry.py +0 -0
  118. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_utils.py +0 -0
  119. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/__init__.py +0 -0
  120. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/build_display.py +0 -0
  121. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/build_logs.py +0 -0
  122. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/celebrate.py +0 -0
  123. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/config.py +0 -0
  124. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/context.py +0 -0
  125. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/cursor.py +0 -0
  126. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/docker.py +0 -0
  127. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/env_check.py +0 -0
  128. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/environment.py +0 -0
  129. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/git.py +0 -0
  130. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/local_runner.py +0 -0
  131. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/logging.py +0 -0
  132. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/mcp.py +0 -0
  133. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/metadata.py +0 -0
  134. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/package_runner.py +0 -0
  135. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/registry.py +0 -0
  136. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/remote_runner.py +0 -0
  137. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/runner.py +0 -0
  138. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/server.py +0 -0
  139. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/source_hash.py +0 -0
  140. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tasks.py +0 -0
  141. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/__init__.py +0 -0
  142. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_config.py +0 -0
  143. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_docker.py +0 -0
  144. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_docker_hints.py +0 -0
  145. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_env_check.py +0 -0
  146. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_environment.py +0 -0
  147. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_git.py +0 -0
  148. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_interactive_module.py +0 -0
  149. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_local_runner.py +0 -0
  150. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_logging_utils.py +0 -0
  151. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_metadata.py +0 -0
  152. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_package_runner.py +0 -0
  153. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_registry_utils.py +0 -0
  154. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_remote_runner.py +0 -0
  155. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_runner_modules.py +0 -0
  156. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_source_hash.py +0 -0
  157. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/tests/test_tasks.py +0 -0
  158. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/validation.py +0 -0
  159. {hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/viewer.py +0 -0
  160. {hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/__init__.py +0 -0
  161. {hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/tests/__init__.py +0 -0
  162. {hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/tests/test_utils.py +0 -0
  163. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/connection.py +0 -0
  164. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/connectors/__init__.py +0 -0
  165. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/connectors/base.py +0 -0
  166. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/connectors/local.py +0 -0
  167. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/connectors/mcp_config.py +0 -0
  168. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/connectors/openai.py +0 -0
  169. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/connectors/remote.py +0 -0
  170. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/environment.py +0 -0
  171. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/integrations/__init__.py +0 -0
  172. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/integrations/adk.py +0 -0
  173. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/integrations/anthropic.py +0 -0
  174. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/integrations/gemini.py +0 -0
  175. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/integrations/langchain.py +0 -0
  176. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/integrations/llamaindex.py +0 -0
  177. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/integrations/openai.py +0 -0
  178. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/mock.py +0 -0
  179. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/router.py +0 -0
  180. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/__init__.py +0 -0
  181. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/test_connection.py +0 -0
  182. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/test_connectors.py +0 -0
  183. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/test_environment.py +0 -0
  184. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/test_integrations.py +0 -0
  185. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/tests/test_local_connectors.py +0 -0
  186. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/types.py +0 -0
  187. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/utils/__init__.py +0 -0
  188. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/utils/formats.py +0 -0
  189. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/utils/schema.py +0 -0
  190. {hud_python-0.5.25 → hud_python-0.5.27}/hud/environment/utils/tool_wrappers.py +0 -0
  191. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/__init__.py +0 -0
  192. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/instrument.py +0 -0
  193. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/parallel.py +0 -0
  194. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/tests/__init__.py +0 -0
  195. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/tests/test_manager.py +0 -0
  196. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/tests/test_parallel.py +0 -0
  197. {hud_python-0.5.25 → hud_python-0.5.27}/hud/eval/utils.py +0 -0
  198. {hud_python-0.5.25 → hud_python-0.5.27}/hud/native/__init__.py +0 -0
  199. {hud_python-0.5.25 → hud_python-0.5.27}/hud/native/comparator.py +0 -0
  200. {hud_python-0.5.25 → hud_python-0.5.27}/hud/native/tests/__init__.py +0 -0
  201. {hud_python-0.5.25 → hud_python-0.5.27}/hud/native/tests/test_comparator.py +0 -0
  202. {hud_python-0.5.25 → hud_python-0.5.27}/hud/native/tests/test_native_init.py +0 -0
  203. {hud_python-0.5.25 → hud_python-0.5.27}/hud/patches/__init__.py +0 -0
  204. {hud_python-0.5.25 → hud_python-0.5.27}/hud/patches/mcp_patches.py +0 -0
  205. {hud_python-0.5.25 → hud_python-0.5.27}/hud/patches/warnings.py +0 -0
  206. {hud_python-0.5.25 → hud_python-0.5.27}/hud/py.typed +0 -0
  207. {hud_python-0.5.25 → hud_python-0.5.27}/hud/samples/__init__.py +0 -0
  208. {hud_python-0.5.25 → hud_python-0.5.27}/hud/samples/browser.py +0 -0
  209. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/__init__.py +0 -0
  210. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/context.py +0 -0
  211. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/helper/__init__.py +0 -0
  212. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/low_level.py +0 -0
  213. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/router.py +0 -0
  214. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/server.py +0 -0
  215. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/__init__.py +0 -0
  216. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_add_tool.py +0 -0
  217. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_context.py +0 -0
  218. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_mcp_server_handlers.py +0 -0
  219. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_mcp_server_integration.py +0 -0
  220. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_mcp_server_more.py +0 -0
  221. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_run_wrapper.py +0 -0
  222. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_server_extra.py +0 -0
  223. {hud_python-0.5.25 → hud_python-0.5.27}/hud/server/tests/test_sigterm_runner.py +0 -0
  224. {hud_python-0.5.25 → hud_python-0.5.27}/hud/settings.py +0 -0
  225. {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/__init__.py +0 -0
  226. {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/exceptions.py +0 -0
  227. {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/hints.py +0 -0
  228. {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/requests.py +0 -0
  229. {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/tests/__init__.py +0 -0
  230. {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/tests/test_exceptions.py +0 -0
  231. {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/tests/test_hints.py +0 -0
  232. {hud_python-0.5.25 → hud_python-0.5.27}/hud/shared/tests/test_requests.py +0 -0
  233. {hud_python-0.5.25 → hud_python-0.5.27}/hud/telemetry/__init__.py +0 -0
  234. {hud_python-0.5.25 → hud_python-0.5.27}/hud/telemetry/exporter.py +0 -0
  235. {hud_python-0.5.25 → hud_python-0.5.27}/hud/telemetry/instrument.py +0 -0
  236. {hud_python-0.5.25 → hud_python-0.5.27}/hud/telemetry/tests/__init__.py +0 -0
  237. {hud_python-0.5.25 → hud_python-0.5.27}/hud/telemetry/tests/test_eval_telemetry.py +0 -0
  238. {hud_python-0.5.25 → hud_python-0.5.27}/hud/telemetry/tests/test_exporter.py +0 -0
  239. {hud_python-0.5.25 → hud_python-0.5.27}/hud/telemetry/tests/test_instrument.py +0 -0
  240. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/__init__.py +0 -0
  241. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/agent.py +0 -0
  242. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/base.py +0 -0
  243. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/__init__.py +0 -0
  244. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/apply_patch.py +0 -0
  245. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/bash.py +0 -0
  246. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/edit.py +0 -0
  247. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/gemini_edit.py +0 -0
  248. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/gemini_shell.py +0 -0
  249. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/session.py +0 -0
  250. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/shell.py +0 -0
  251. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/__init__.py +0 -0
  252. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/test_apply_patch.py +0 -0
  253. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/test_bash.py +0 -0
  254. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/test_bash_extended.py +0 -0
  255. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/test_bash_integration.py +0 -0
  256. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/test_edit.py +0 -0
  257. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/test_gemini_tools.py +0 -0
  258. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/tests/test_shell.py +0 -0
  259. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/coding/utils.py +0 -0
  260. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/__init__.py +0 -0
  261. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/gemini.py +0 -0
  262. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/glm.py +0 -0
  263. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/hud.py +0 -0
  264. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/openai.py +0 -0
  265. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/qwen.py +0 -0
  266. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/settings.py +0 -0
  267. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/tests/__init__.py +0 -0
  268. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/tests/test_computer.py +0 -0
  269. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/tests/test_computer_actions.py +0 -0
  270. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/computer/tests/test_glm_computer.py +0 -0
  271. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/executors/__init__.py +0 -0
  272. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/executors/base.py +0 -0
  273. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/executors/pyautogui.py +0 -0
  274. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/executors/tests/__init__.py +0 -0
  275. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/executors/tests/test_base_executor.py +0 -0
  276. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  277. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/executors/xdo.py +0 -0
  278. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/__init__.py +0 -0
  279. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/base.py +0 -0
  280. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/gemini.py +0 -0
  281. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/glob.py +0 -0
  282. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/grep.py +0 -0
  283. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/list.py +0 -0
  284. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/read.py +0 -0
  285. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/tests/__init__.py +0 -0
  286. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/tests/test_glob.py +0 -0
  287. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/tests/test_grep.py +0 -0
  288. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/tests/test_list.py +0 -0
  289. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/filesystem/tests/test_read.py +0 -0
  290. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/grounding/__init__.py +0 -0
  291. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/grounding/config.py +0 -0
  292. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/grounding/grounded_tool.py +0 -0
  293. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/grounding/grounder.py +0 -0
  294. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/grounding/tests/__init__.py +0 -0
  295. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  296. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/hosted/__init__.py +0 -0
  297. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/hosted/base.py +0 -0
  298. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/hosted/code_execution.py +0 -0
  299. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/hosted/google_search.py +0 -0
  300. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/hosted/url_context.py +0 -0
  301. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/hosted/web_fetch.py +0 -0
  302. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/hosted/web_search.py +0 -0
  303. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/jupyter.py +0 -0
  304. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/__init__.py +0 -0
  305. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/base.py +0 -0
  306. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/claude.py +0 -0
  307. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/gemini.py +0 -0
  308. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/session.py +0 -0
  309. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/tests/__init__.py +0 -0
  310. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/tests/test_claude.py +0 -0
  311. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/tests/test_gemini.py +0 -0
  312. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/memory/tests/test_session.py +0 -0
  313. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/native_types.py +0 -0
  314. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/playwright.py +0 -0
  315. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/response.py +0 -0
  316. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/submit.py +0 -0
  317. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/__init__.py +0 -0
  318. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_agent_tool.py +0 -0
  319. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_base.py +0 -0
  320. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_init.py +0 -0
  321. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_jupyter_tool.py +0 -0
  322. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_native_tool_e2e.py +0 -0
  323. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_native_types.py +0 -0
  324. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_playwright_tool.py +0 -0
  325. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_response.py +0 -0
  326. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_submit.py +0 -0
  327. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_tools.py +0 -0
  328. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_tools_init.py +0 -0
  329. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_types.py +0 -0
  330. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/tests/test_utils.py +0 -0
  331. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/types.py +0 -0
  332. {hud_python-0.5.25 → hud_python-0.5.27}/hud/tools/utils.py +0 -0
  333. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/__init__.py +0 -0
  334. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/env.py +0 -0
  335. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/hud_console.py +0 -0
  336. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/mcp.py +0 -0
  337. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/pretty_errors.py +0 -0
  338. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/strict_schema.py +0 -0
  339. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/telemetry.py +0 -0
  340. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tests/__init__.py +0 -0
  341. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tests/test_init.py +0 -0
  342. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tests/test_mcp.py +0 -0
  343. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tests/test_pretty_errors.py +0 -0
  344. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tests/test_telemetry.py +0 -0
  345. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tests/test_tool_shorthand.py +0 -0
  346. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/tool_shorthand.py +0 -0
  347. {hud_python-0.5.25 → hud_python-0.5.27}/hud/utils/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.5.25
3
+ Version: 0.5.27
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import copy
6
+ import json
6
7
  import logging
7
8
  from inspect import cleandoc
8
9
  from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
@@ -85,7 +86,12 @@ class ClaudeAgent(MCPAgent):
85
86
  logger.debug("Legacy fallback: detected %s as computer tool", tool.name)
86
87
  model_lower = (self.model or "").lower()
87
88
  if any(
88
- fnmatch.fnmatch(model_lower, p) for p in ("claude-opus-4-5*", "claude-opus-4-6*")
89
+ fnmatch.fnmatch(model_lower, p)
90
+ for p in (
91
+ "claude-opus-4-5*",
92
+ "claude-opus-4-6*",
93
+ "claude-sonnet-4-6*",
94
+ )
89
95
  ):
90
96
  return NativeToolSpec(
91
97
  api_type="computer_20251124",
@@ -149,15 +155,15 @@ class ClaudeAgent(MCPAgent):
149
155
 
150
156
  # these will be initialized in _convert_tools_for_claude
151
157
  self.has_computer_tool = False
152
- self.tool_mapping: dict[str, str] = {}
153
- self.claude_tools: list[BetaToolUnionParam] = []
154
- self._required_betas: set[str] = set()
158
+ self.tool_mapping = {}
159
+ self.claude_tools = []
160
+ self._required_betas = set()
155
161
 
156
162
  def _on_tools_ready(self) -> None:
157
163
  """Build Claude-specific tool mappings after tools are discovered."""
158
164
  self._convert_tools_for_claude()
159
165
 
160
- async def get_system_messages(self) -> list[BetaMessageParam]:
166
+ async def get_system_messages(self) -> list[types.ContentBlock]:
161
167
  """No system messages for Claude because applied in get_response"""
162
168
  return []
163
169
 
@@ -195,10 +201,42 @@ class ClaudeAgent(MCPAgent):
195
201
 
196
202
  return [BetaMessageParam(role="user", content=anthropic_blocks)]
197
203
 
204
+ @staticmethod
205
+ def _extract_invalid_tool_json(exc: Exception) -> str | None:
206
+ """Extract malformed tool JSON payload from Anthropic stream errors.
207
+
208
+ Returns None when the exception is unrelated to tool JSON parsing.
209
+ """
210
+ message = str(exc)
211
+ parse_error_prefix = "Unable to parse tool parameter JSON from model."
212
+ if parse_error_prefix not in message:
213
+ return None
214
+
215
+ marker = "JSON: "
216
+ marker_index = message.find(marker)
217
+ if marker_index == -1:
218
+ return ""
219
+
220
+ return message[marker_index + len(marker) :].strip()
221
+
222
+ @staticmethod
223
+ def _build_invalid_tool_json_retry_message(invalid_json: str) -> BetaMessageParam:
224
+ """Build a user message prompting the model to re-emit valid tool JSON."""
225
+ wrapped = json.dumps({"INVALID_JSON": invalid_json}, ensure_ascii=True)
226
+ retry_text = (
227
+ "Your previous tool-call arguments were invalid JSON and could not be parsed.\n"
228
+ "Retry the same intended tool call once with valid JSON arguments only.\n"
229
+ "Ensure all strings are quoted and all arrays/objects are valid JSON.\n"
230
+ f"Malformed payload (wrapped): {wrapped}"
231
+ )
232
+ return BetaMessageParam(
233
+ role="user",
234
+ content=[text_to_content_block(retry_text)],
235
+ )
236
+
198
237
  async def get_response(self, messages: list[BetaMessageParam]) -> AgentResponse:
199
238
  """Get response from Claude including any tool calls."""
200
239
  messages_cached = self._add_prompt_caching(messages)
201
-
202
240
  # betas to use - collected during tool conversion based on native specs
203
241
  # Only pass betas when non-empty; an empty list can produce an empty
204
242
  # anthropic-beta header which the API rejects.
@@ -223,21 +261,58 @@ class ClaudeAgent(MCPAgent):
223
261
  ) from None
224
262
  else:
225
263
  # Regular Anthropic client supports .stream()
226
- async with self.anthropic_client.beta.messages.stream(
227
- model=self.config.model,
228
- system=self.system_prompt if self.system_prompt is not None else Omit(),
229
- max_tokens=self.max_tokens,
230
- messages=messages_cached,
231
- tools=self.claude_tools,
232
- tool_choice={"type": "auto", "disable_parallel_tool_use": True},
233
- betas=betas,
234
- ) as stream:
235
- # allow backend to accumulate message content
236
- async for _ in stream:
237
- pass
238
- # get final message
239
- response = await stream.get_final_message()
240
- messages.append(BetaMessageParam(role="assistant", content=response.content))
264
+ response = None
265
+ invalid_json_failures = 0
266
+ for _ in range(3):
267
+ messages_cached = self._add_prompt_caching(messages)
268
+ try:
269
+ async with self.anthropic_client.beta.messages.stream(
270
+ model=self.config.model,
271
+ system=self.system_prompt if self.system_prompt is not None else Omit(),
272
+ max_tokens=self.max_tokens,
273
+ messages=messages_cached,
274
+ tools=self.claude_tools,
275
+ tool_choice={"type": "auto", "disable_parallel_tool_use": True},
276
+ betas=betas,
277
+ ) as stream:
278
+ # allow backend to accumulate message content
279
+ async for _ in stream:
280
+ pass
281
+ # get final message
282
+ response = await stream.get_final_message()
283
+ messages.append(
284
+ BetaMessageParam(
285
+ role="assistant",
286
+ content=response.content,
287
+ )
288
+ )
289
+ break
290
+ except ValueError as exc:
291
+ invalid_json = self._extract_invalid_tool_json(exc)
292
+ is_retryable = invalid_json is not None
293
+ if not is_retryable:
294
+ raise
295
+
296
+ invalid_json_failures += 1
297
+ if invalid_json_failures == 1:
298
+ logger.warning(
299
+ "Claude returned invalid streamed tool JSON; "
300
+ "retrying same generation once"
301
+ )
302
+ continue
303
+
304
+ if invalid_json_failures == 2:
305
+ logger.warning(
306
+ "Claude returned invalid streamed tool JSON twice; "
307
+ "retrying once with INVALID_JSON guidance"
308
+ )
309
+ messages.append(self._build_invalid_tool_json_retry_message(invalid_json))
310
+ continue
311
+
312
+ raise
313
+
314
+ if response is None:
315
+ raise ValueError("Claude response missing after stream retries")
241
316
 
242
317
  # Process response
243
318
  result = AgentResponse(content="", tool_calls=[], done=True)
@@ -99,6 +99,30 @@ class MockStreamContextManager:
99
99
  return self.response
100
100
 
101
101
 
102
+ class MockErrorStreamContextManager:
103
+ """Mock stream context manager that raises a fixed error while streaming."""
104
+
105
+ def __init__(self, error: Exception) -> None:
106
+ self.error = error
107
+
108
+ async def __aenter__(self) -> MockErrorStreamContextManager:
109
+ return self
110
+
111
+ async def __aexit__(
112
+ self, exc_type: type | None, exc_val: Exception | None, exc_tb: Any
113
+ ) -> bool:
114
+ return False
115
+
116
+ def __aiter__(self) -> MockErrorStreamContextManager:
117
+ return self
118
+
119
+ async def __anext__(self) -> None:
120
+ raise self.error
121
+
122
+ async def get_final_message(self) -> MagicMock:
123
+ raise AssertionError("get_final_message should not be called when stream iteration fails")
124
+
125
+
102
126
  class TestClaudeHelperFunctions:
103
127
  """Test helper functions for Claude message formatting."""
104
128
 
@@ -410,6 +434,120 @@ class TestClaudeAgent:
410
434
  assert response.tool_calls[0].name == "my_tool"
411
435
  assert response.tool_calls[0].arguments == {"x": "value"}
412
436
 
437
+ @pytest.mark.asyncio
438
+ async def test_get_response_retries_same_generation_once_on_invalid_streamed_tool_json(
439
+ self, mock_anthropic: AsyncAnthropic
440
+ ) -> None:
441
+ """First invalid streamed tool JSON should retry without adding guidance."""
442
+ invalid_json_error = ValueError(
443
+ "Unable to parse tool parameter JSON from model. Please retry your request or "
444
+ "adjust your "
445
+ 'prompt. Error: expected value at line 1 column 10. JSON: {"labels": bug}'
446
+ )
447
+ first_stream = MockErrorStreamContextManager(invalid_json_error)
448
+
449
+ mock_response = MagicMock()
450
+ mock_response.content = [MagicMock(type="text", text="Recovered")]
451
+ second_stream = MockStreamContextManager(mock_response)
452
+
453
+ mock_anthropic.beta.messages.stream = MagicMock(side_effect=[first_stream, second_stream])
454
+
455
+ agent = ClaudeAgent.create(
456
+ model_client=mock_anthropic,
457
+ validate_api_key=False,
458
+ )
459
+ agent.claude_tools = []
460
+ agent.tool_mapping = {}
461
+ agent.has_computer_tool = False
462
+ agent._initialized = True
463
+
464
+ messages: list[BetaMessageParam] = [
465
+ cast(
466
+ "BetaMessageParam",
467
+ {"role": "user", "content": [{"type": "text", "text": "Create a Linear ticket"}]},
468
+ )
469
+ ]
470
+
471
+ response = await agent.get_response(messages)
472
+
473
+ assert response.content == "Recovered"
474
+ assert mock_anthropic.beta.messages.stream.call_count == 2
475
+ # Original user message + assistant response (no guidance message needed)
476
+ assert len(messages) == 2
477
+ assert messages[1]["role"] == "assistant"
478
+
479
+ @pytest.mark.asyncio
480
+ async def test_get_response_adds_invalid_json_guidance_after_second_failure(
481
+ self, mock_anthropic: AsyncAnthropic
482
+ ) -> None:
483
+ """Second consecutive invalid JSON failure should add INVALID_JSON guidance."""
484
+ invalid_json_error = ValueError(
485
+ "Unable to parse tool parameter JSON from model. Please retry your request or "
486
+ "adjust your "
487
+ 'prompt. Error: expected value at line 1 column 10. JSON: {"labels": bug}'
488
+ )
489
+ first_stream = MockErrorStreamContextManager(invalid_json_error)
490
+ second_stream = MockErrorStreamContextManager(invalid_json_error)
491
+
492
+ mock_response = MagicMock()
493
+ mock_response.content = [MagicMock(type="text", text="Recovered after guidance")]
494
+ third_stream = MockStreamContextManager(mock_response)
495
+
496
+ mock_anthropic.beta.messages.stream = MagicMock(
497
+ side_effect=[first_stream, second_stream, third_stream]
498
+ )
499
+
500
+ agent = ClaudeAgent.create(
501
+ model_client=mock_anthropic,
502
+ validate_api_key=False,
503
+ )
504
+ agent.claude_tools = []
505
+ agent.tool_mapping = {}
506
+ agent.has_computer_tool = False
507
+ agent._initialized = True
508
+
509
+ messages: list[BetaMessageParam] = [
510
+ cast(
511
+ "BetaMessageParam",
512
+ {"role": "user", "content": [{"type": "text", "text": "Create a Linear ticket"}]},
513
+ )
514
+ ]
515
+
516
+ response = await agent.get_response(messages)
517
+
518
+ assert response.content == "Recovered after guidance"
519
+ assert mock_anthropic.beta.messages.stream.call_count == 3
520
+ # Original user message + INVALID_JSON guidance + assistant response
521
+ assert len(messages) == 3
522
+ retry_message = messages[1]
523
+ assert retry_message["role"] == "user"
524
+ retry_content = cast("list[dict[str, Any]]", retry_message["content"])
525
+ assert "INVALID_JSON" in retry_content[0]["text"]
526
+
527
+ @pytest.mark.asyncio
528
+ async def test_get_response_does_not_retry_unrelated_value_error(
529
+ self, mock_anthropic: AsyncAnthropic
530
+ ) -> None:
531
+ """Non-tool-json ValueErrors should propagate immediately."""
532
+ unrelated_error = ValueError("stream exploded for unrelated reason")
533
+ mock_anthropic.beta.messages.stream = MagicMock(
534
+ return_value=MockErrorStreamContextManager(unrelated_error)
535
+ )
536
+
537
+ agent = ClaudeAgent.create(
538
+ model_client=mock_anthropic,
539
+ validate_api_key=False,
540
+ )
541
+ agent.claude_tools = []
542
+ agent.tool_mapping = {}
543
+ agent.has_computer_tool = False
544
+ agent._initialized = True
545
+
546
+ with pytest.raises(ValueError, match="unrelated reason"):
547
+ await agent.get_response([])
548
+
549
+ assert mock_anthropic.beta.messages.stream.call_count == 1
550
+
413
551
 
414
552
  class TestClaudeAgentBedrock:
415
553
  """Test ClaudeAgent class with Bedrock."""
@@ -50,42 +50,41 @@ def show_dev_server_info(
50
50
 
51
51
  # Server section
52
52
  hud_console.section_title("Server")
53
- hud_console.print(f"{hud_console.sym.ITEM} {escape(server_name)}")
53
+ hud_console.console.print(f"{hud_console.sym.ITEM} {escape(server_name)}", highlight=False)
54
+ _print = lambda msg: hud_console.console.print(msg, highlight=False)
54
55
  if transport == "http":
55
- hud_console.print(f"{hud_console.sym.ITEM} http://localhost:{port}/mcp")
56
+ _print(f"{hud_console.sym.ITEM} http://localhost:{port}/mcp")
56
57
  else:
57
- hud_console.print(f"{hud_console.sym.ITEM} (stdio)")
58
+ _print(f"{hud_console.sym.ITEM} (stdio)")
58
59
 
59
60
  # Quick Links (only for HTTP mode)
60
61
  if transport == "http":
61
62
  hud_console.section_title("Quick Links")
62
- hud_console.print(f"{hud_console.sym.ITEM} Docs: http://localhost:{port}/docs")
63
- hud_console.print(f"{hud_console.sym.ITEM} Cursor:")
63
+ _print(f"{hud_console.sym.ITEM} Docs: http://localhost:{port}/docs")
64
+ _print(f"{hud_console.sym.ITEM} Cursor:")
64
65
  # Display the Cursor link on its own line to prevent wrapping
65
66
  hud_console.link(cursor_deeplink)
66
67
 
67
68
  # Show eval endpoint if in Docker mode
68
69
  if docker_mode:
69
- hud_console.print(
70
- f"{hud_console.sym.ITEM} Eval API: http://localhost:{port}/eval (POST)"
71
- )
70
+ _print(f"{hud_console.sym.ITEM} Eval API: http://localhost:{port}/eval (POST)")
72
71
 
73
72
  # Show debugging URLs from telemetry
74
73
  if telemetry:
75
74
  if "live_url" in telemetry:
76
75
  url = escape(telemetry["live_url"])
77
- hud_console.print(f"{hud_console.sym.ITEM} Live URL: {url}")
76
+ _print(f"{hud_console.sym.ITEM} Live URL: {url}")
78
77
  if "vnc_url" in telemetry:
79
- hud_console.print(f"{hud_console.sym.ITEM} VNC URL: {escape(telemetry['vnc_url'])}")
78
+ _print(f"{hud_console.sym.ITEM} VNC URL: {escape(telemetry['vnc_url'])}")
80
79
  if "cdp_url" in telemetry:
81
- hud_console.print(f"{hud_console.sym.ITEM} CDP URL: {escape(telemetry['cdp_url'])}")
80
+ _print(f"{hud_console.sym.ITEM} CDP URL: {escape(telemetry['cdp_url'])}")
82
81
 
83
82
  # Check for VNC (browser environment)
84
83
  if env_dir and (env_dir / "environment" / "server.py").exists():
85
84
  try:
86
85
  content = (env_dir / "environment" / "server.py").read_text()
87
86
  if "x11vnc" in content.lower() or "vnc" in content.lower():
88
- hud_console.print(f"{hud_console.sym.ITEM} VNC: http://localhost:8080/vnc.html")
87
+ _print(f"{hud_console.sym.ITEM} VNC: http://localhost:8080/vnc.html")
89
88
  except Exception: # noqa: S110
90
89
  pass
91
90
 
@@ -96,7 +96,7 @@ _DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
96
96
  # max_steps = 10
97
97
  # group_size = 1
98
98
  # byok = false # Remote only; use encrypted env vars on the platform.
99
- # task_ids = ["task_1", "task_2"]
99
+ # task_ids = ["checkout-smoke", "0"] # slugs or 0-based indices
100
100
  # verbose = true
101
101
  # very_verbose = true
102
102
  # auto_respond = true
@@ -627,15 +627,18 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
627
627
  hud_console.error(f"No tasks found in: {cfg.source}")
628
628
  raise typer.Exit(1)
629
629
 
630
- # Filter by task IDs if provided
630
+ # Filter by task slugs (or positional indices) if provided
631
631
  if cfg.task_ids:
632
- id_set = set(cfg.task_ids)
633
- # Match by task.id or index
634
- filtered = [t for i, t in enumerate(tasks) if t.id in id_set or str(i) in id_set]
632
+ selector_set = set(cfg.task_ids)
633
+ filtered = []
634
+ for i, task in enumerate(tasks):
635
+ task_slug = getattr(task, "slug", None)
636
+ if (isinstance(task_slug, str) and task_slug in selector_set) or str(i) in selector_set:
637
+ filtered.append(task)
635
638
  if not filtered:
636
- hud_console.error(f"No tasks found matching IDs: {', '.join(cfg.task_ids)}")
639
+ hud_console.error(f"No tasks found matching slugs/indices: {', '.join(cfg.task_ids)}")
637
640
  raise typer.Exit(1)
638
- hud_console.info(f"Filtered to {len(filtered)} task(s) by ID")
641
+ hud_console.info(f"Filtered to {len(filtered)} task(s) by slug/index")
639
642
  tasks = filtered
640
643
  elif not cfg.all:
641
644
  # Single task mode (no --all, --full, or --task-ids)
@@ -687,33 +690,16 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
687
690
  sanitized[agent_name] = agent_settings
688
691
  eval_cfg_dict["agent_config"] = sanitized
689
692
 
690
- tasks_to_create = [t for t in tasks if cfg.taskset and not t.id]
691
- tasks_data = (
692
- [t.model_dump(mode="json", exclude_none=True) for t in tasks_to_create]
693
- if tasks_to_create
694
- else None
695
- )
696
-
697
- ids = await _send_job_enter(
693
+ await _send_job_enter(
698
694
  job_id=job_id,
699
695
  name=f"eval ({cfg.source})" if cfg.source else "eval",
700
696
  variants=None,
701
697
  group=cfg.group_size,
702
698
  api_key=None,
703
699
  taskset=cfg.taskset,
704
- tasks=tasks_data,
705
700
  hud_eval_config=eval_cfg_dict,
706
701
  )
707
702
 
708
- if cfg.taskset and ids:
709
- if len(ids) != len(tasks_to_create):
710
- hud_console.warning(
711
- f"Task count mismatch: sent {len(tasks_to_create)} tasks, "
712
- f"received {len(ids)} IDs. Some tasks may not be linked."
713
- )
714
- for task_obj, task_version_id in zip(tasks_to_create, ids, strict=False):
715
- task_obj.id = task_version_id
716
-
717
703
  trace_ids = await submit_rollouts(
718
704
  tasks=tasks,
719
705
  job_id=job_id,
@@ -809,7 +795,11 @@ def eval_command(
809
795
  help="Automatically prompt the agent to continue if it does not respond with a tool call",
810
796
  ),
811
797
  group_size: int | None = typer.Option(None, "--group-size", help="Runs per task"),
812
- task_ids: str | None = typer.Option(None, "--task-ids", help="Comma-separated task IDs to run"),
798
+ task_ids: str | None = typer.Option(
799
+ None,
800
+ "--task-ids",
801
+ help="Comma-separated task slugs (or 0-based indices) to run",
802
+ ),
813
803
  yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation"),
814
804
  remote: bool = typer.Option(
815
805
  False, "--remote", help="Submit tasks to platform for remote execution"
@@ -138,8 +138,9 @@ def show_dev_ui(
138
138
  # Show other info below
139
139
  label = "Base image" if is_docker else "Server"
140
140
  hud_console.info("")
141
- hud_console.print(f"{hud_console.sym.ITEM} {escape(label)}: {escape(server_name)}")
142
- hud_console.print(f"{hud_console.sym.ITEM} Cursor:")
141
+ _print = lambda msg: hud_console.console.print(msg, highlight=False)
142
+ _print(f"{hud_console.sym.ITEM} {escape(label)}: {escape(server_name)}")
143
+ _print(f"{hud_console.sym.ITEM} Cursor:")
143
144
  # Display the Cursor link on its own line to prevent wrapping
144
145
  hud_console.link(cursor_deeplink)
145
146
  hud_console.info("")
@@ -5,6 +5,7 @@ from __future__ import annotations
5
5
  import shutil
6
6
 
7
7
  import typer
8
+ from rich.markup import escape
8
9
 
9
10
  from hud.utils.hud_console import HUDConsole
10
11
 
@@ -91,8 +92,8 @@ def remove_environment(
91
92
  if image:
92
93
  hud_console.info("")
93
94
  hud_console.info("Note: The Docker image may still exist locally.")
94
- hud_console.info(
95
- f"To remove it, run: [cyan]docker rmi {image.split('@')[0]}[/cyan]"
95
+ hud_console.print(
96
+ f"To remove it, run: [cyan]docker rmi {escape(image.split('@')[0])}[/cyan]"
96
97
  )
97
98
  except Exception as e:
98
99
  hud_console.error(f"Failed to remove environment: {e}")
@@ -60,12 +60,12 @@ class TestIncrementVersion:
60
60
  def test_increment_minor(self):
61
61
  """Test incrementing minor version."""
62
62
  assert increment_version("1.2.3", "minor") == "1.3.0"
63
- assert increment_version("0.5.25", "minor") == "0.6.0"
63
+ assert increment_version("0.5.27", "minor") == "0.6.0"
64
64
 
65
65
  def test_increment_major(self):
66
66
  """Test incrementing major version."""
67
67
  assert increment_version("1.2.3", "major") == "2.0.0"
68
- assert increment_version("0.5.25", "major") == "1.0.0"
68
+ assert increment_version("0.5.27", "major") == "1.0.0"
69
69
 
70
70
  def test_increment_with_v_prefix(self):
71
71
  """Test incrementing version with v prefix."""
@@ -433,7 +433,7 @@ class InteractiveMCPTester:
433
433
  # Show next steps tutorial
434
434
  self.console.section_title("Next Steps")
435
435
  self.console.info("🏗️ Ready to test with real agents? Run:")
436
- self.console.info(" [cyan]hud build[/cyan]")
436
+ self.console.print(" [cyan]hud build[/cyan]")
437
437
  self.console.info("")
438
438
  self.console.info("This will:")
439
439
  self.console.info(" 1. Build your environment image")
@@ -441,8 +441,10 @@ class InteractiveMCPTester:
441
441
  self.console.info(" 3. Prepare it for testing with agents")
442
442
  self.console.info("")
443
443
  self.console.info("Then you can:")
444
- self.console.info(" • Test locally: [cyan]hud run <image>[/cyan]")
445
- self.console.info(" • Push to registry: [cyan]hud push --image <registry/name>[/cyan]")
444
+ self.console.print(" • Test locally: [cyan]hud run <image>[/cyan]")
445
+ self.console.print(
446
+ " • Push to registry: [cyan]hud push --image <registry/name>[/cyan]"
447
+ )
446
448
  self.console.info(" • Use with agents via the lock file")
447
449
 
448
450
  console.print("\n[dim]Happy testing! 🎉[/dim]")
@@ -26,6 +26,7 @@ from typing import NamedTuple
26
26
 
27
27
  import httpx
28
28
  from packaging import version
29
+ from rich.markup import escape
29
30
 
30
31
  from hud.utils.hud_console import HUDConsole
31
32
 
@@ -241,16 +242,12 @@ def display_update_prompt(console: HUDConsole | None = None) -> None:
241
242
  else:
242
243
  upgrade_cmd = "uv tool upgrade hud-python"
243
244
 
244
- # Create update message
245
- update_msg = (
246
- f"🆕 A new version of hud-python is available: "
247
- f"[bold cyan]{info.latest}[/bold cyan] "
248
- f"(current: [dim]{info.current}[/dim])\n"
249
- f" Run: [bold yellow]{upgrade_cmd}[/bold yellow] to update"
245
+ console.print(
246
+ f"[yellow]🆕 A new version of hud-python is available: "
247
+ f"[bold cyan]{escape(info.latest)}[/bold cyan] "
248
+ f"(current: [dim]{escape(info.current)}[/dim])\n"
249
+ f" Run: [bold yellow]{escape(upgrade_cmd)}[/bold yellow] to update[/yellow]"
250
250
  )
251
-
252
- # Display using console info
253
- console.info(f"[yellow]{update_msg}[/yellow]")
254
251
  except Exception: # noqa: S110
255
252
  # Never let version checking disrupt the user's workflow
256
253
  pass
@@ -110,6 +110,8 @@ def _load_from_huggingface(dataset_name: str) -> list[Task]:
110
110
 
111
111
  def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]:
112
112
  """Load raw task dicts from HUD API."""
113
+ from hud.datasets.utils import _normalize_task_dict
114
+
113
115
  headers = {}
114
116
  if settings.api_key:
115
117
  headers["Authorization"] = f"Bearer {settings.api_key}"
@@ -126,13 +128,11 @@ def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]:
126
128
  # Extract tasks dict from response
127
129
  tasks_dict = data.get("tasks", {})
128
130
 
129
- raw_items: list[dict[str, Any]] = []
130
- for task_id, task_data in tasks_dict.items():
131
- if task_data.get("id") is None:
132
- task_data["id"] = task_id
133
- raw_items.append(task_data)
134
-
135
- return raw_items
131
+ return [
132
+ _normalize_task_dict(task_data)
133
+ for task_data in tasks_dict.values()
134
+ if isinstance(task_data, dict)
135
+ ]
136
136
 
137
137
 
138
138
  def _load_from_api(dataset_name: str) -> list[Task]:
@@ -282,8 +282,13 @@ def save_tasks(
282
282
  "Use Task.from_v4(legacy_task) to convert from LegacyTask."
283
283
  )
284
284
 
285
- # Convert tasks to dicts (Task is a Pydantic model)
286
- task_dicts = [task.model_dump(mode="json", exclude_none=True) for task in tasks]
285
+ # Convert tasks to dicts (Task is a Pydantic model).
286
+ # id is internal/platform-assigned; uploads should identify via slug.
287
+ task_dicts: list[dict[str, Any]] = []
288
+ for task in tasks:
289
+ task_data = task.model_dump(mode="json", exclude_none=True)
290
+ task_data.pop("id", None)
291
+ task_dicts.append(task_data)
287
292
 
288
293
  # Build request payload
289
294
  payload: dict[str, Any] = {
@@ -296,7 +301,7 @@ def save_tasks(
296
301
  try:
297
302
  with httpx.Client(timeout=60) as client:
298
303
  response = client.post(
299
- f"{settings.hud_api_url}/tasks/evalset",
304
+ f"{settings.hud_api_url}/tasks/upload",
300
305
  json=payload,
301
306
  headers=headers,
302
307
  )
@@ -187,7 +187,7 @@ async def run_single_task(
187
187
  ```
188
188
  """
189
189
  # Determine trace name
190
- effective_trace_name = trace_name or task_id or task.id or "single_task"
190
+ effective_trace_name = trace_name or task_id or task.slug or "single_task"
191
191
 
192
192
  # Run with explicit eval context parameters
193
193
  async with hud.eval(