hud-python 0.4.54__tar.gz → 0.4.56__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (303) hide show
  1. {hud_python-0.4.54 → hud_python-0.4.56}/PKG-INFO +1 -1
  2. hud_python-0.4.56/environments/rubrics/README.md +182 -0
  3. hud_python-0.4.56/environments/rubrics/environment/pyproject.toml +18 -0
  4. hud_python-0.4.56/environments/rubrics/pyproject.toml +19 -0
  5. hud_python-0.4.56/environments/rubrics/server/pyproject.toml +19 -0
  6. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/base.py +8 -0
  7. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/claude.py +4 -3
  8. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/openai.py +2 -1
  9. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/openai_chat_generic.py +3 -2
  10. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/tests/test_claude.py +2 -2
  11. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/tests/test_openai.py +1 -1
  12. hud_python-0.4.56/hud/agents/utils.py +50 -0
  13. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/__init__.py +52 -1
  14. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/build.py +185 -25
  15. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/dev.py +129 -39
  16. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/eval.py +99 -1
  17. hud_python-0.4.56/hud/cli/flows/dev.py +155 -0
  18. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/flows/tasks.py +29 -9
  19. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/init.py +3 -1
  20. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/docker.py +6 -3
  21. {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/base.py +2 -2
  22. {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/context.py +42 -1
  23. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/server.py +29 -3
  24. {hud_python-0.4.54 → hud_python-0.4.56}/hud/settings.py +6 -0
  25. {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/async_context.py +16 -2
  26. {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/trace.py +6 -1
  27. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/group_eval.py +14 -2
  28. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_agent_factories.py +2 -1
  29. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_version.py +1 -1
  30. {hud_python-0.4.54 → hud_python-0.4.56}/hud/version.py +1 -1
  31. {hud_python-0.4.54 → hud_python-0.4.56}/pyproject.toml +1 -1
  32. {hud_python-0.4.54 → hud_python-0.4.56}/.gitignore +0 -0
  33. {hud_python-0.4.54 → hud_python-0.4.56}/LICENSE +0 -0
  34. {hud_python-0.4.54 → hud_python-0.4.56}/README.md +0 -0
  35. {hud_python-0.4.54 → hud_python-0.4.56}/environments/README.md +0 -0
  36. {hud_python-0.4.54 → hud_python-0.4.56}/environments/blank/README.md +0 -0
  37. {hud_python-0.4.54 → hud_python-0.4.56}/environments/blank/environment/README.md +0 -0
  38. {hud_python-0.4.54 → hud_python-0.4.56}/environments/blank/environment/pyproject.toml +0 -0
  39. {hud_python-0.4.54 → hud_python-0.4.56}/environments/blank/server/README.md +0 -0
  40. {hud_python-0.4.54 → hud_python-0.4.56}/environments/blank/server/pyproject.toml +0 -0
  41. {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/README.md +0 -0
  42. {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/environment/2048/README.md +0 -0
  43. {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/environment/2048/backend/pyproject.toml +0 -0
  44. {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/environment/README.md +0 -0
  45. {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/environment/pyproject.toml +0 -0
  46. {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/environment/todo/README.md +0 -0
  47. {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/environment/todo/backend/pyproject.toml +0 -0
  48. {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/pyproject.toml +0 -0
  49. {hud_python-0.4.54 → hud_python-0.4.56}/environments/browser/server/pyproject.toml +0 -0
  50. {hud_python-0.4.54 → hud_python-0.4.56}/environments/deepresearch/README.md +0 -0
  51. {hud_python-0.4.54 → hud_python-0.4.56}/environments/deepresearch/environment/pyproject.toml +0 -0
  52. {hud_python-0.4.54 → hud_python-0.4.56}/environments/deepresearch/pyproject.toml +0 -0
  53. {hud_python-0.4.54 → hud_python-0.4.56}/environments/deepresearch/server/pyproject.toml +0 -0
  54. {hud_python-0.4.54 → hud_python-0.4.56}/environments/remote_browser/README.md +0 -0
  55. {hud_python-0.4.54 → hud_python-0.4.56}/environments/remote_browser/pyproject.toml +0 -0
  56. {hud_python-0.4.54 → hud_python-0.4.56}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
  57. {hud_python-0.4.54 → hud_python-0.4.56}/environments/text_2048/README.md +0 -0
  58. {hud_python-0.4.54 → hud_python-0.4.56}/environments/text_2048/pyproject.toml +0 -0
  59. {hud_python-0.4.54 → hud_python-0.4.56}/examples/README.md +0 -0
  60. {hud_python-0.4.54 → hud_python-0.4.56}/hud/__init__.py +0 -0
  61. {hud_python-0.4.54 → hud_python-0.4.56}/hud/__main__.py +0 -0
  62. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/__init__.py +0 -0
  63. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/grounded_openai.py +0 -0
  64. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/langchain.py +0 -0
  65. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/lite_llm.py +0 -0
  66. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/misc/__init__.py +0 -0
  67. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/misc/integration_test_agent.py +0 -0
  68. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/misc/response_agent.py +0 -0
  69. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/tests/__init__.py +0 -0
  70. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/tests/test_base.py +0 -0
  71. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/tests/test_base_runtime.py +0 -0
  72. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/tests/test_client.py +0 -0
  73. {hud_python-0.4.54 → hud_python-0.4.56}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  74. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/__main__.py +0 -0
  75. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/analyze.py +0 -0
  76. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/clone.py +0 -0
  77. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/debug.py +0 -0
  78. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/flows/__init__.py +0 -0
  79. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/get.py +0 -0
  80. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/list_func.py +0 -0
  81. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/pull.py +0 -0
  82. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/push.py +0 -0
  83. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/remove.py +0 -0
  84. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/__init__.py +0 -0
  85. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/celebrate.py +0 -0
  86. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/config.py +0 -0
  87. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/display.py +0 -0
  88. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/gpu.py +0 -0
  89. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/gpu_utils.py +0 -0
  90. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/local_runner.py +0 -0
  91. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/presets.py +0 -0
  92. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/remote_runner.py +0 -0
  93. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/rl_api.py +0 -0
  94. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/viewer.py +0 -0
  95. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/vllm.py +0 -0
  96. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/rl/wait_utils.py +0 -0
  97. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/__init__.py +0 -0
  98. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_analyze.py +0 -0
  99. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_analyze_metadata.py +0 -0
  100. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_analyze_module.py +0 -0
  101. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_build.py +0 -0
  102. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_build_failure.py +0 -0
  103. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_build_module.py +0 -0
  104. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_cli_init.py +0 -0
  105. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_cli_main.py +0 -0
  106. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
  107. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_cli_root.py +0 -0
  108. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_clone.py +0 -0
  109. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_cursor.py +0 -0
  110. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_debug.py +0 -0
  111. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_eval.py +0 -0
  112. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_list_func.py +0 -0
  113. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_main_module.py +0 -0
  114. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_mcp_server.py +0 -0
  115. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_pull.py +0 -0
  116. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_push.py +0 -0
  117. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_push_happy.py +0 -0
  118. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_push_wrapper.py +0 -0
  119. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_registry.py +0 -0
  120. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/tests/test_utils.py +0 -0
  121. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/__init__.py +0 -0
  122. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/config.py +0 -0
  123. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/cursor.py +0 -0
  124. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/env_check.py +0 -0
  125. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/environment.py +0 -0
  126. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/interactive.py +0 -0
  127. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/local_runner.py +0 -0
  128. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/logging.py +0 -0
  129. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/metadata.py +0 -0
  130. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/package_runner.py +0 -0
  131. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/registry.py +0 -0
  132. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/remote_runner.py +0 -0
  133. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/runner.py +0 -0
  134. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/server.py +0 -0
  135. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/source_hash.py +0 -0
  136. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tasks.py +0 -0
  137. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/__init__.py +0 -0
  138. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_config.py +0 -0
  139. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_docker.py +0 -0
  140. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_docker_hints.py +0 -0
  141. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_env_check.py +0 -0
  142. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_environment.py +0 -0
  143. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_interactive_module.py +0 -0
  144. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_local_runner.py +0 -0
  145. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_logging_utils.py +0 -0
  146. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_metadata.py +0 -0
  147. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_package_runner.py +0 -0
  148. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_registry_utils.py +0 -0
  149. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_remote_runner.py +0 -0
  150. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_runner_modules.py +0 -0
  151. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_source_hash.py +0 -0
  152. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/tests/test_tasks.py +0 -0
  153. {hud_python-0.4.54 → hud_python-0.4.56}/hud/cli/utils/version_check.py +0 -0
  154. {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/README.md +0 -0
  155. {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/__init__.py +0 -0
  156. {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/fastmcp.py +0 -0
  157. {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/mcp_use.py +0 -0
  158. {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/tests/__init__.py +0 -0
  159. {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/tests/test_client_integration.py +0 -0
  160. {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/tests/test_fastmcp.py +0 -0
  161. {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/tests/test_mcp_use_retry.py +0 -0
  162. {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/tests/test_protocol.py +0 -0
  163. {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/utils/__init__.py +0 -0
  164. {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/utils/mcp_use_retry.py +0 -0
  165. {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/utils/retry.py +0 -0
  166. {hud_python-0.4.54 → hud_python-0.4.56}/hud/clients/utils/retry_transport.py +0 -0
  167. {hud_python-0.4.54 → hud_python-0.4.56}/hud/datasets/__init__.py +0 -0
  168. {hud_python-0.4.54 → hud_python-0.4.56}/hud/datasets/parallel.py +0 -0
  169. {hud_python-0.4.54 → hud_python-0.4.56}/hud/datasets/runner.py +0 -0
  170. {hud_python-0.4.54 → hud_python-0.4.56}/hud/datasets/tests/__init__.py +0 -0
  171. {hud_python-0.4.54 → hud_python-0.4.56}/hud/datasets/tests/test_runner.py +0 -0
  172. {hud_python-0.4.54 → hud_python-0.4.56}/hud/datasets/tests/test_utils.py +0 -0
  173. {hud_python-0.4.54 → hud_python-0.4.56}/hud/datasets/utils.py +0 -0
  174. {hud_python-0.4.54 → hud_python-0.4.56}/hud/misc/__init__.py +0 -0
  175. {hud_python-0.4.54 → hud_python-0.4.56}/hud/misc/claude_plays_pokemon.py +0 -0
  176. {hud_python-0.4.54 → hud_python-0.4.56}/hud/native/__init__.py +0 -0
  177. {hud_python-0.4.54 → hud_python-0.4.56}/hud/native/comparator.py +0 -0
  178. {hud_python-0.4.54 → hud_python-0.4.56}/hud/native/tests/__init__.py +0 -0
  179. {hud_python-0.4.54 → hud_python-0.4.56}/hud/native/tests/test_comparator.py +0 -0
  180. {hud_python-0.4.54 → hud_python-0.4.56}/hud/native/tests/test_native_init.py +0 -0
  181. {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/__init__.py +0 -0
  182. {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/collector.py +0 -0
  183. {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/config.py +0 -0
  184. {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/exporters.py +0 -0
  185. {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/instrumentation.py +0 -0
  186. {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/processors.py +0 -0
  187. {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/tests/__init__.py +0 -0
  188. {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/tests/test_instrumentation.py +0 -0
  189. {hud_python-0.4.54 → hud_python-0.4.56}/hud/otel/tests/test_processors.py +0 -0
  190. {hud_python-0.4.54 → hud_python-0.4.56}/hud/py.typed +0 -0
  191. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/README.md +0 -0
  192. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/__init__.py +0 -0
  193. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/actor.py +0 -0
  194. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/buffer.py +0 -0
  195. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/chat_template.jinja +0 -0
  196. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/config.py +0 -0
  197. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/distributed.py +0 -0
  198. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/learner.py +0 -0
  199. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/tests/__init__.py +0 -0
  200. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/tests/test_learner.py +0 -0
  201. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/train.py +0 -0
  202. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/types.py +0 -0
  203. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/utils/start_vllm_server.sh +0 -0
  204. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/utils.py +0 -0
  205. {hud_python-0.4.54 → hud_python-0.4.56}/hud/rl/vllm_adapter.py +0 -0
  206. {hud_python-0.4.54 → hud_python-0.4.56}/hud/samples/__init__.py +0 -0
  207. {hud_python-0.4.54 → hud_python-0.4.56}/hud/samples/browser.py +0 -0
  208. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/__init__.py +0 -0
  209. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/context.py +0 -0
  210. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/helper/__init__.py +0 -0
  211. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/low_level.py +0 -0
  212. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/router.py +0 -0
  213. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/__init__.py +0 -0
  214. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_add_tool.py +0 -0
  215. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_context.py +0 -0
  216. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_mcp_server_handlers.py +0 -0
  217. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_mcp_server_integration.py +0 -0
  218. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_mcp_server_more.py +0 -0
  219. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_run_wrapper.py +0 -0
  220. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_server_extra.py +0 -0
  221. {hud_python-0.4.54 → hud_python-0.4.56}/hud/server/tests/test_sigterm_runner.py +0 -0
  222. {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/__init__.py +0 -0
  223. {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/exceptions.py +0 -0
  224. {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/hints.py +0 -0
  225. {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/requests.py +0 -0
  226. {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/tests/__init__.py +0 -0
  227. {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/tests/test_exceptions.py +0 -0
  228. {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/tests/test_hints.py +0 -0
  229. {hud_python-0.4.54 → hud_python-0.4.56}/hud/shared/tests/test_requests.py +0 -0
  230. {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/__init__.py +0 -0
  231. {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/instrument.py +0 -0
  232. {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/job.py +0 -0
  233. {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/replay.py +0 -0
  234. {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/tests/__init__.py +0 -0
  235. {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/tests/test_async_context.py +0 -0
  236. {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/tests/test_instrument.py +0 -0
  237. {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/tests/test_job.py +0 -0
  238. {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/tests/test_replay.py +0 -0
  239. {hud_python-0.4.54 → hud_python-0.4.56}/hud/telemetry/tests/test_trace.py +0 -0
  240. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/__init__.py +0 -0
  241. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/base.py +0 -0
  242. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/bash.py +0 -0
  243. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/computer/__init__.py +0 -0
  244. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/computer/anthropic.py +0 -0
  245. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/computer/hud.py +0 -0
  246. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/computer/openai.py +0 -0
  247. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/computer/qwen.py +0 -0
  248. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/computer/settings.py +0 -0
  249. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/edit.py +0 -0
  250. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/executors/__init__.py +0 -0
  251. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/executors/base.py +0 -0
  252. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/executors/pyautogui.py +0 -0
  253. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/executors/tests/__init__.py +0 -0
  254. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/executors/tests/test_base_executor.py +0 -0
  255. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  256. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/executors/xdo.py +0 -0
  257. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/grounding/__init__.py +0 -0
  258. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/grounding/config.py +0 -0
  259. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/grounding/grounded_tool.py +0 -0
  260. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/grounding/grounder.py +0 -0
  261. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/grounding/tests/__init__.py +0 -0
  262. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  263. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/playwright.py +0 -0
  264. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/response.py +0 -0
  265. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/submit.py +0 -0
  266. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/__init__.py +0 -0
  267. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_base.py +0 -0
  268. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_bash.py +0 -0
  269. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_bash_extended.py +0 -0
  270. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_computer.py +0 -0
  271. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_computer_actions.py +0 -0
  272. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_edit.py +0 -0
  273. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_init.py +0 -0
  274. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_playwright_tool.py +0 -0
  275. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_response.py +0 -0
  276. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_submit.py +0 -0
  277. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_tools.py +0 -0
  278. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_tools_init.py +0 -0
  279. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_types.py +0 -0
  280. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/tests/test_utils.py +0 -0
  281. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/types.py +0 -0
  282. {hud_python-0.4.54 → hud_python-0.4.56}/hud/tools/utils.py +0 -0
  283. {hud_python-0.4.54 → hud_python-0.4.56}/hud/types.py +0 -0
  284. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/__init__.py +0 -0
  285. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/agent_factories.py +0 -0
  286. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/async_utils.py +0 -0
  287. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/hud_console.py +0 -0
  288. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/mcp.py +0 -0
  289. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/pretty_errors.py +0 -0
  290. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/progress.py +0 -0
  291. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/task_tracking.py +0 -0
  292. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tasks.py +0 -0
  293. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/telemetry.py +0 -0
  294. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/__init__.py +0 -0
  295. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_async_utils.py +0 -0
  296. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_init.py +0 -0
  297. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_mcp.py +0 -0
  298. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_pretty_errors.py +0 -0
  299. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_progress.py +0 -0
  300. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_tasks.py +0 -0
  301. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_telemetry.py +0 -0
  302. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tests/test_tool_shorthand.py +0 -0
  303. {hud_python-0.4.54 → hud_python-0.4.56}/hud/utils/tool_shorthand.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.54
3
+ Version: 0.4.56
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -0,0 +1,182 @@
1
+ # Rubrics Environment
2
+
3
+ Web research environment powered by the Exa API for searching and fetching content, with rubric-based evaluation for structured grading.
4
+ See [docs](https://docs.hud.so/build-environments) for the complete environment design workflow.
5
+
6
+ ## Architecture
7
+
8
+ **`environment/`** - Manages Exa API integration and state
9
+ - Holds the Exa API key server-side
10
+ - Exposes HTTP endpoints `/search`, `/fetch`, `/answer`, `/evaluate` for research workflows
11
+ - Implements exponential backoff for rate limiting
12
+
13
+ **`server/`** - Wraps data in MCP tools
14
+ - Provides `search()`, `fetch()`, `answer()`, `evaluate()` tools for agents
15
+ - Agents and tasks interact only with these tools
16
+
17
+ **Why separate?** Edit tools for the agent or tasks without restarting the environment backend.
18
+
19
+ ## Tools
20
+
21
+ - **`search(query: str)`** - Search the web using the Exa API; returns a list of results with titles and URLs
22
+ - **`fetch(url: str)`** - Fetch the full content of a URL; returns a summary, highlights, and text
23
+ - **`answer(final_answer: str)`** - Submit the final research answer
24
+ - **`evaluate(rubric: list[dict])`** - Evaluate submitted answer using a structured rubric with weighted requirements
25
+
26
+ ### Rubric-Based Evaluation
27
+
28
+ The `evaluate` tool uses The LLM Data Company's [rubric](https://github.com/The-LLM-Data-Company/rubric/) package to grade answers against structured criteria with autograders.
29
+
30
+ ## Setup
31
+
32
+ ### Requirements
33
+ - Exa API key (get one at [exa.ai](https://exa.ai))
34
+
35
+ ### Environment Variables
36
+ ```bash
37
+ export EXA_API_KEY="your_exa_api_key_here"
38
+ ```
39
+
40
+ ## Development
41
+
42
+ ```bash
43
+ # Terminal 1 - Environment backend
44
+ cd environment
45
+ export EXA_API_KEY="your_key"
46
+ uv run uvicorn server:app --reload
47
+
48
+ # Terminal 2 - MCP server
49
+ cd server
50
+ uv run hud dev
51
+ ```
52
+
53
+ The environment includes exponential backoff for rate limiting, so API calls will automatically retry on 429 errors.
54
+
55
+ In general, we recommend starting work on the environment backend first, then developing the MCP server to expose the right things to the agent.
56
+
57
+ For complex environments that require many dependencies, we recommend running `hud dev` in the environment root:
58
+ ```bash
59
+ cd ..
60
+ export EXA_API_KEY="your_key"
61
+ hud dev
62
+ ```
63
+
64
+ ## Tasks & Evaluation
65
+
66
+ ```bash
67
+ # Build first from the top-level folder that contains the Dockerfile (creates rubrics:0.1.0)
68
+ hud build
69
+ ```
70
+
71
+ Your `tasks.json` uses `docker run` to launch the environment:
72
+
73
+ ```json
74
+ {
75
+ "prompt": "Research and answer: What is the capital of France?",
76
+ "mcp_config": {
77
+ "local": {
78
+ "command": "docker",
79
+ "args": ["run", "--rm", "-i", "-e", "EXA_API_KEY", "rubrics:latest"]
80
+ }
81
+ },
82
+ "evaluate_tool": {
83
+ "name": "evaluate",
84
+ "arguments": {
85
+ "rubric": [
86
+ {
87
+ "requirement": "Correctly identifies Paris as the capital of France",
88
+ "weight": 5
89
+ },
90
+ {
91
+ "requirement": "Provides additional context about Paris (population, history, or geography)",
92
+ "weight": 10
93
+ }
94
+ ]
95
+ }
96
+ }
97
+ }
98
+ ```
99
+
100
+ **Note:** The `-e EXA_API_KEY` flag passes your local API key to the container.
101
+
102
+ **Commands:**
103
+ ```bash
104
+ # Build first
105
+ hud build
106
+
107
+ # Test task locally
108
+ export EXA_API_KEY="your_key"
109
+ hud eval tasks.json
110
+
111
+ # Push environment for remote running
112
+ hud push
113
+
114
+ # Production RL training
115
+ hud rl tasks.json # Auto-converts docker→remote, builds & pushes if needed
116
+ ```
117
+
118
+ ## Publishing Your Environment
119
+
120
+ Once your environment is ready, you can share it with the community:
121
+
122
+ ### 1. Push to Registry
123
+ ```bash
124
+ # Build and push your environment (requires a Docker Hub login and a HUD API key)
125
+ hud build
126
+ hud push
127
+ ```
128
+
129
+ ### 2. Create a Dataset
130
+
131
+ Create a dataset on HuggingFace with your tasks:
132
+
133
+ **Option A: Upload manually**
134
+ 1. Upload your `tasks.json` to HuggingFace
135
+ 2. Make sure it's **public** to appear on leaderboards
136
+
137
+ **Option B: Use the SDK**
138
+ ```python
139
+ from hud.datasets import save_tasks
140
+ import json
141
+
142
+ # Load your tasks
143
+ with open("tasks.json") as f:
144
+ tasks = json.load(f)
145
+
146
+ # Push to HuggingFace
147
+ save_tasks(tasks, repo_id="your-org/your-dataset")
148
+ ```
149
+
150
+ ### 3. Run and Track Performance
151
+
152
+ ```bash
153
+ # Run Claude on your benchmark
154
+ hud eval "your-org/your-dataset" --agent claude
155
+
156
+ # View results at:
157
+ # hud.so/leaderboards/your-org/your-dataset
158
+ ```
159
+
160
+ **Note**: Only public HuggingFace datasets appear as leaderboards!
161
+
162
+ 📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
163
+
164
+ ## Example Research Workflow
165
+
166
+ ```python
167
+ # Agent searches for information
168
+ results = search("latest AI developments 2024")
169
+
170
+ # Agent fetches detailed content from top result
171
+ content = fetch(results[0]["url"])
172
+
173
+ # Agent submits final answer
174
+ answer("Based on research, AI developments in 2024 include...")
175
+
176
+ # Evaluate answer using rubric
177
+ result = evaluate(rubric=[
178
+ {"requirement": "Mentions at least 3 specific AI developments", "weight": 15},
179
+ {"requirement": "Includes dates or timeframes for developments", "weight": 5},
180
+ ])
181
+ # Returns: {"reward": float, "info": {"report": [...]}, "done": True}
182
+ ```
@@ -0,0 +1,18 @@
1
+ [project]
2
+ name = "rubrics-environment"
3
+ version = "0.1.0"
4
+ description = "Backend service for Rubrics environment"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "fastapi>=0.104.1",
8
+ "uvicorn[standard]>=0.24.0",
9
+ "httpx>=0.24.0",
10
+ "rubric>=1.1.7",
11
+ ]
12
+
13
+ [build-system]
14
+ requires = ["hatchling"]
15
+ build-backend = "hatchling.build"
16
+
17
+ [tool.hatch.build.targets.wheel]
18
+ packages = ["environment"]
@@ -0,0 +1,19 @@
1
+ [project]
2
+ name = "rubrics"
3
+ version = "0.1.0"
4
+ description = "Rubrics HUD environment with HTTP backend (EXA on server)"
5
+ requires-python = ">=3.11"
6
+ dependencies = [ "hud-python==0.4.42", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "httpx>=0.24.0",]
7
+
8
+ [build-system]
9
+ requires = [ "hatchling",]
10
+ build-backend = "hatchling.build"
11
+
12
+ [tool.hud]
13
+ image = "rubrics:dev"
14
+
15
+ [tool.hatch.metadata]
16
+ allow-direct-references = true
17
+
18
+ [tool.hatch.build.targets.wheel]
19
+ packages = [ "controller", "environment",]
@@ -0,0 +1,19 @@
1
+ [project]
2
+ name = "rubrics-mcp"
3
+ version = "0.1.0"
4
+ description = "MCP server for Rubrics environment"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "hud-python>=0.4.54",
8
+ "httpx>=0.24.0",
9
+ ]
10
+
11
+ [build-system]
12
+ requires = ["hatchling"]
13
+ build-backend = "hatchling.build"
14
+
15
+ [tool.hatch.metadata]
16
+ allow-direct-references = true
17
+
18
+ [tool.hatch.build.targets.wheel]
19
+ packages = ["mcp"]
@@ -11,6 +11,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal
11
11
 
12
12
  import mcp.types as types
13
13
 
14
+ from hud.agents.utils import log_agent_metadata_to_status, log_task_config_to_current_trace
14
15
  from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
15
16
  from hud.utils.hud_console import HUDConsole
16
17
  from hud.utils.mcp import MCPConfigPatch, patch_mcp_config, setup_hud_telemetry
@@ -62,6 +63,7 @@ class MCPAgent(ABC):
62
63
  initial_screenshot: bool = True,
63
64
  # Misc
64
65
  model_name: str = "mcp-agent",
66
+ checkpoint_name: str | None = None,
65
67
  response_agent: ResponseAgent | None = None,
66
68
  auto_trace: bool = True,
67
69
  verbose: bool = False,
@@ -92,6 +94,7 @@ class MCPAgent(ABC):
92
94
  self._auto_created_client = False # Track if we created the client
93
95
 
94
96
  self.model_name = model_name
97
+ self.checkpoint_name = checkpoint_name
95
98
  self.console = HUDConsole(logger=logger)
96
99
 
97
100
  # Set verbose mode if requested
@@ -198,6 +201,8 @@ class MCPAgent(ABC):
198
201
  f"Agent initialized with {len(self.get_available_tools())} tools: {', '.join([t.name for t in self.get_available_tools()])}" # noqa: E501
199
202
  )
200
203
 
204
+ await log_agent_metadata_to_status(self.model_name, self.checkpoint_name)
205
+
201
206
  async def run(self, prompt_or_task: str | Task | dict[str, Any], max_steps: int = 10) -> Trace:
202
207
  """
203
208
  Run the agent with the given prompt or task.
@@ -223,6 +228,9 @@ class MCPAgent(ABC):
223
228
 
224
229
  # Handle Task objects with full lifecycle
225
230
  if isinstance(prompt_or_task, Task):
231
+ # Log a compact summary of task config to the current trace (async)
232
+ await log_task_config_to_current_trace(prompt_or_task)
233
+
226
234
  return await self.run_task(prompt_or_task, max_steps)
227
235
 
228
236
  # Handle simple string prompts
@@ -89,7 +89,8 @@ class ClaudeAgent(MCPAgent):
89
89
  self.use_computer_beta = use_computer_beta
90
90
  self.hud_console = HUDConsole(logger=logger)
91
91
 
92
- self.model_name = self.model
92
+ self.model_name = "Claude"
93
+ self.checkpoint_name = self.model
93
94
 
94
95
  # Track mapping from Claude tool names to MCP tool names
95
96
  self._claude_to_mcp_tool_map: dict[str, str] = {}
@@ -98,14 +99,14 @@ class ClaudeAgent(MCPAgent):
98
99
  # Append Claude-specific instructions to the base system prompt
99
100
  claude_instructions = """
100
101
  You are Claude, an AI assistant created by Anthropic. You are helpful, harmless, and honest.
101
-
102
+
102
103
  When working on tasks:
103
104
  1. Be thorough and systematic in your approach
104
105
  2. Complete tasks autonomously without asking for confirmation
105
106
  3. Use available tools efficiently to accomplish your goals
106
107
  4. Verify your actions and ensure task completion
107
108
  5. Be precise and accurate in all operations
108
-
109
+
109
110
  Remember: You are expected to complete tasks autonomously. The user trusts you to accomplish what they asked.
110
111
  """.strip() # noqa: E501
111
112
 
@@ -70,6 +70,7 @@ class OperatorAgent(MCPAgent):
70
70
 
71
71
  self.openai_client = model_client
72
72
  self.model = model
73
+ self.checkpoint_name = self.model
73
74
  self.environment = environment
74
75
 
75
76
  # State tracking for OpenAI's stateful API
@@ -84,7 +85,7 @@ class OperatorAgent(MCPAgent):
84
85
  except Exception as e:
85
86
  raise ValueError(f"OpenAI API key is invalid: {e}") from e
86
87
 
87
- self.model_name = "openai-" + self.model
88
+ self.model_name = "Operator"
88
89
 
89
90
  # Append OpenAI-specific instructions to the base system prompt
90
91
  openai_instructions = """
@@ -62,7 +62,8 @@ class GenericOpenAIChatAgent(MCPAgent):
62
62
  else:
63
63
  raise ValueError("Either openai_client or (api_key and base_url) must be provided")
64
64
 
65
- self.model_name = model_name
65
+ self.model_name = "GenericOpenAI"
66
+ self.checkpoint_name = model_name
66
67
  self.completion_kwargs: dict[str, Any] = completion_kwargs or {}
67
68
  self.mcp_schemas = []
68
69
  self.hud_console = HUDConsole(logger=logger)
@@ -194,7 +195,7 @@ class GenericOpenAIChatAgent(MCPAgent):
194
195
  raise ValueError("openai_client is required for GenericOpenAIChatAgent")
195
196
  # default transport = OpenAI SDK
196
197
  return await self.oai.chat.completions.create(
197
- model=self.model_name,
198
+ model=self.checkpoint_name,
198
199
  messages=messages,
199
200
  tools=tools, # type: ignore ready ChatCompletionToolParam-shaped
200
201
  **extra,
@@ -89,7 +89,7 @@ class TestClaudeAgent:
89
89
  validate_api_key=False, # Skip validation in tests
90
90
  )
91
91
 
92
- assert agent.model_name == "claude-3-opus-20240229"
92
+ assert agent.model_name == "Claude"
93
93
  assert agent.max_tokens == 1000
94
94
  assert agent.anthropic_client == mock_model_client
95
95
 
@@ -103,7 +103,7 @@ class TestClaudeAgent:
103
103
  validate_api_key=False, # Skip validation in tests
104
104
  )
105
105
 
106
- assert agent.model_name == "claude-3-opus-20240229"
106
+ assert agent.model_name == "Claude"
107
107
  assert agent.anthropic_client is not None
108
108
 
109
109
  @pytest.mark.asyncio
@@ -50,7 +50,7 @@ class TestOperatorAgent:
50
50
  validate_api_key=False, # Skip validation in tests
51
51
  )
52
52
 
53
- assert agent.model_name == "openai-gpt-4"
53
+ assert agent.model_name == "Operator"
54
54
  assert agent.model == "gpt-4"
55
55
  assert agent.openai_client == mock_model_client
56
56
 
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ from typing import TYPE_CHECKING
5
+
6
+ from hud.otel.context import (
7
+ _update_task_status_async,
8
+ get_current_task_run_id,
9
+ )
10
+
11
+ if TYPE_CHECKING:
12
+ from hud.datasets import Task
13
+
14
+
15
async def log_task_config_to_current_trace(task: Task) -> None:
    """Attach the task's configuration to the current trace's status metadata.

    Best-effort telemetry: nothing is sent when there is no current task run
    id, and any exception raised while dumping or sending is suppressed so
    that logging can never interrupt the caller.

    Args:
        task: Task whose ``model_dump()`` output is stored under the
            ``task_config`` metadata key and whose ``id`` is passed as
            ``task_id``.
    """
    with contextlib.suppress(Exception):
        run_id = get_current_task_run_id()
        if run_id:
            # Dump first so a serialization failure is swallowed before any
            # status update is attempted.
            metadata = {"task_config": task.model_dump()}
            await _update_task_status_async(
                run_id,
                "running",
                task_id=task.id,
                extra_metadata=metadata,
            )
29
+
30
+
31
async def log_agent_metadata_to_status(
    model_name: str | None = None, checkpoint_name: str | None = None
) -> None:
    """Attach agent metadata (model/checkpoint) to current trace status metadata.

    Best-effort telemetry: any exception raised here is suppressed.

    Nothing is sent when there is no current task run id, or when both names
    are falsy. Otherwise every name that is not ``None`` is included under the
    ``agent`` metadata key, so an empty string is still recorded if the other
    name is set.

    Args:
        model_name: Name recorded as ``agent.model_name``.
        checkpoint_name: Name recorded as ``agent.checkpoint_name``.
    """
    with contextlib.suppress(Exception):
        run_id = get_current_task_run_id()
        if not run_id:
            return
        if not (model_name or checkpoint_name):
            return

        # Insertion order (model first, checkpoint second) is kept on purpose.
        candidates = {"model_name": model_name, "checkpoint_name": checkpoint_name}
        agent_meta = {key: value for key, value in candidates.items() if value is not None}

        await _update_task_status_async(
            run_id,
            "running",
            extra_metadata={"agent": agent_meta},
        )
@@ -382,6 +382,11 @@ def dev(
382
382
  "--watch",
383
383
  help="Additional directories to watch for changes (default: current directory)",
384
384
  ),
385
+ new: bool = typer.Option(
386
+ False,
387
+ "--new",
388
+ help="Show Cursor installation link for new server setup",
389
+ ),
385
390
  ) -> None:
386
391
  """🔥 Development mode - run MCP server with hot-reload.
387
392
 
@@ -422,6 +427,7 @@ def dev(
422
427
  watch,
423
428
  docker=docker,
424
429
  docker_args=docker_args,
430
+ new=new,
425
431
  )
426
432
 
427
433
 
@@ -740,7 +746,7 @@ def init(
740
746
  None,
741
747
  "--preset",
742
748
  "-p",
743
- help="Preset to use: blank, deep-research, browser. If omitted, you'll choose interactively.", # noqa: E501
749
+ help="Preset to use: blank, deep-research, browser, rubrics. If omitted, you'll choose interactively.", # noqa: E501
744
750
  ),
745
751
  directory: str = typer.Option(".", "--dir", "-d", help="Target directory"),
746
752
  force: bool = typer.Option(False, "--force", "-f", help="Overwrite existing files"),
@@ -1079,6 +1085,51 @@ def rl(
1079
1085
  )
1080
1086
 
1081
1087
 
1088
@app.command()
def convert(
    tasks_file: str = typer.Argument(
        ..., help="Path to tasks file (JSON/JSONL) to convert to remote MCP configuration"
    ),
) -> None:
    """Convert local MCP task configs to remote (mcp.hud.so) format.

    This mirrors the implicit conversion flow used by 'hud rl' and writes a new
    remote_<name>.json next to the source file when needed.
    """
    # Imports are local to the command, matching the original block.
    from pathlib import Path

    from hud.utils.hud_console import HUDConsole

    console = HUDConsole()

    try:
        from .flows.tasks import convert_tasks_to_remote

        converted = convert_tasks_to_remote(tasks_file)

        # A result that resolves to the input path is treated as "nothing to
        # convert". The comparison is best-effort: if it fails we only log at
        # debug level and fall through to the regular success message.
        try:
            unchanged = Path(converted).resolve() == Path(tasks_file).resolve()
            if unchanged:
                console.success("Tasks already reference remote MCP URLs. No conversion needed.")
                console.hint("You can run them directly with: hud eval <tasks_file> --full")
                return
        except Exception as exc:
            console.debug(f"Path comparison failed, continuing: {exc}")

        console.success(f"Converted tasks written to: {converted}")
        console.hint(
            "You can now run remote flows: hud rl <converted_file> or hud eval <converted_file>"
        )
    except typer.Exit:
        # Let deliberate CLI exits pass through untouched.
        raise
    except Exception as exc:
        # Any other failure is reported and turned into exit code 1.
        console.error(f"Failed to convert tasks: {exc}")
        raise typer.Exit(1) from exc
1131
+
1132
+
1082
1133
  @app.command()
1083
1134
  def set(
1084
1135
  assignments: list[str] = typer.Argument( # type: ignore[arg-type] # noqa: B008