hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
@@ -1,197 +0,0 @@
1
- """Tests for OpenTelemetry processors."""
2
-
3
- from __future__ import annotations
4
-
5
- from unittest.mock import MagicMock, patch
6
-
7
- from hud.otel.processors import HudEnrichmentProcessor
8
-
9
-
10
- class TestHudEnrichmentProcessor:
11
- """Test HudEnrichmentProcessor."""
12
-
13
- def test_on_start_with_run_id(self):
14
- """Test on_start with current task run ID."""
15
-
16
- processor = HudEnrichmentProcessor()
17
-
18
- # Mock span
19
- span = MagicMock()
20
- span.set_attribute = MagicMock()
21
- span.is_recording.return_value = True
22
-
23
- # Mock baggage to return run ID
24
- parent_context = {}
25
- with patch("hud.otel.processors.baggage.get_baggage") as mock_get_baggage:
26
- # Return run ID for task_run_id, None for job_id
27
- mock_get_baggage.side_effect = (
28
- lambda key, context: "test-run-123" if key == "hud.task_run_id" else None
29
- )
30
- processor.on_start(span, parent_context)
31
-
32
- # Verify attribute was set
33
- span.set_attribute.assert_called_with("hud.task_run_id", "test-run-123")
34
-
35
- def test_on_start_no_run_id(self):
36
- """Test on_start without current task run ID."""
37
-
38
- processor = HudEnrichmentProcessor()
39
-
40
- # Mock span
41
- span = MagicMock()
42
- span.set_attribute = MagicMock()
43
- span.is_recording.return_value = True
44
- span.name = "test_span"
45
-
46
- # Set up attributes to return None (not matching any step type)
47
- span.attributes = {}
48
-
49
- # Mock baggage to return None
50
- parent_context = {}
51
- with patch("hud.otel.processors.baggage.get_baggage", return_value=None):
52
- processor.on_start(span, parent_context)
53
-
54
- # Verify only step count attributes were set (no run_id or job_id)
55
- calls = span.set_attribute.call_args_list
56
- set_attrs = {call[0][0] for call in calls}
57
-
58
- # Should have step counts but not run_id/job_id
59
- assert "hud.task_run_id" not in set_attrs
60
- assert "hud.job_id" not in set_attrs
61
- assert "hud.base_mcp_steps" in set_attrs
62
- assert "hud.mcp_tool_steps" in set_attrs
63
- assert "hud.agent_steps" in set_attrs
64
-
65
- def test_on_end(self):
66
- """Test on_end does nothing."""
67
-
68
- processor = HudEnrichmentProcessor()
69
- span = MagicMock()
70
-
71
- # Should not raise
72
- processor.on_end(span)
73
-
74
- def test_shutdown(self):
75
- """Test shutdown does nothing."""
76
-
77
- processor = HudEnrichmentProcessor()
78
-
79
- # Should not raise
80
- processor.shutdown()
81
-
82
- def test_force_flush(self):
83
- """Test force_flush returns True."""
84
-
85
- processor = HudEnrichmentProcessor()
86
-
87
- # Should return True
88
- result = processor.force_flush()
89
- assert result is True
90
-
91
- def test_on_start_with_job_id(self):
92
- """Test on_start with job ID in baggage."""
93
-
94
- processor = HudEnrichmentProcessor()
95
-
96
- # Mock span
97
- span = MagicMock()
98
- span.set_attribute = MagicMock()
99
- span.is_recording.return_value = True
100
-
101
- # Mock baggage with job ID
102
- parent_context = {}
103
- with patch("hud.otel.processors.baggage.get_baggage") as mock_get_baggage:
104
- # Return None for task_run_id, job-123 for job_id
105
- mock_get_baggage.side_effect = (
106
- lambda key, context: "job-123" if key == "hud.job_id" else None
107
- )
108
- processor.on_start(span, parent_context)
109
-
110
- # Verify job ID attribute was set
111
- span.set_attribute.assert_called_with("hud.job_id", "job-123")
112
-
113
- def test_on_start_exception_handling(self):
114
- """Test on_start handles exceptions gracefully."""
115
-
116
- processor = HudEnrichmentProcessor()
117
-
118
- # Mock span that raises exception
119
- span = MagicMock()
120
- span.is_recording.side_effect = Exception("Test error")
121
-
122
- # Should not raise
123
- processor.on_start(span, parent_context=None)
124
-
125
- def test_on_start_exception_handling_extended(self):
126
- """Test that exceptions in on_start are caught and logged."""
127
- from hud.otel.processors import HudEnrichmentProcessor
128
-
129
- processor = HudEnrichmentProcessor()
130
-
131
- # Create a mock span that raises when setting attributes
132
- mock_span = MagicMock()
133
- mock_span.is_recording.return_value = True
134
- mock_span.set_attribute.side_effect = RuntimeError("Attribute error")
135
-
136
- parent_context = {}
137
-
138
- # Patch logger and baggage to force an exception when setting attribute
139
- with (
140
- patch("hud.otel.processors.logger") as mock_logger,
141
- patch("hud.otel.processors.baggage.get_baggage", return_value="test-id"),
142
- ):
143
- # Should not raise, exception should be caught
144
- processor.on_start(mock_span, parent_context)
145
-
146
- # Verify logger.debug was called with the exception
147
- mock_logger.debug.assert_called_once()
148
- args = mock_logger.debug.call_args[0]
149
- assert "HudEnrichmentProcessor.on_start error" in args[0]
150
- assert "Attribute error" in str(args[1])
151
-
152
- def test_on_start_with_baggage_get_exception(self):
153
- """Test exception handling when baggage.get_baggage fails for task_run_id."""
154
- processor = HudEnrichmentProcessor()
155
-
156
- mock_span = MagicMock()
157
- mock_span.is_recording.return_value = True
158
-
159
- parent_context = {}
160
-
161
- # Make baggage.get_baggage raise an exception for task_run_id
162
- with (
163
- patch(
164
- "hud.otel.processors.baggage.get_baggage",
165
- side_effect=ValueError("Context error"),
166
- ),
167
- patch("hud.otel.processors.logger") as mock_logger,
168
- ):
169
- # Should not raise
170
- processor.on_start(mock_span, parent_context)
171
-
172
- # Verify logger.debug was called
173
- mock_logger.debug.assert_called_once()
174
- args = mock_logger.debug.call_args[0]
175
- assert "Context error" in str(args[1])
176
-
177
- def test_on_start_with_baggage_exception(self):
178
- """Test exception handling when baggage.get_baggage fails."""
179
- processor = HudEnrichmentProcessor()
180
-
181
- mock_span = MagicMock()
182
- mock_span.is_recording.return_value = True
183
-
184
- parent_context = {}
185
-
186
- # Make baggage.get_baggage raise an exception
187
- with (
188
- patch("hud.otel.processors.baggage.get_baggage", side_effect=KeyError("Baggage error")),
189
- patch("hud.otel.processors.logger") as mock_logger,
190
- ):
191
- # Should not raise
192
- processor.on_start(mock_span, parent_context)
193
-
194
- # Verify logger.debug was called
195
- mock_logger.debug.assert_called_once()
196
- args = mock_logger.debug.call_args[0]
197
- assert "Baggage error" in str(args[1])
hud/rl/README.md DELETED
@@ -1,30 +0,0 @@
1
- We suggest running hud rl (or with the --local flag) for optimal hyperparameters and native HuggingFace running.
2
-
3
- However, to run this independently, spin up an instance with at least 2 GPUs and run:
4
- ```bash
5
- sudo apt-get update -y && sudo apt-get install -y cuda-toolkit-12-6
6
- uv pip install -e .[rl]
7
- uv pip install ninja
8
- uv pip install flash-attn --no-build-isolation
9
- ```
10
-
11
- Launch a vllm server with:
12
- ```bash
13
- export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
14
- export TOKENIZERS_PARALLELISM=false
15
- export VLLM_LOGGING_LEVEL=INFO
16
- export CUDA_VISIBLE_DEVICES=7 # Set this to your last GPU
17
-
18
- uv run vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
19
- --api-key token-abc123 --host 0.0.0.0 --port 8000 --tensor-parallel-size 1 --trust-remote-code \
20
- --max-model-len 16384 --enable-lora --max-lora-rank 64 --max-cpu-loras 4 --enable-auto-tool-choice \
21
- --tool-call-parser hermes --disable-log-requests --dtype auto
22
- ```
23
-
24
- Then start training with (replace 2 with the number of spare GPUs):
25
- ```bash
26
- hud get hud-evals/2048-basic
27
- torchrun --nproc-per-node 2 -m hud.rl.train --tasks 2048-basic.json --verbose
28
- ```
29
-
30
- Add a `--config path/to/config.json` flag to run a specific configuration (or change the defaults in config.py)
hud/rl/__init__.py DELETED
@@ -1 +0,0 @@
1
- """RL module for HUD."""
hud/rl/actor.py DELETED
@@ -1,176 +0,0 @@
1
- """Actor for episode collection using vLLM and HUD."""
2
-
3
- from __future__ import annotations
4
-
5
- import asyncio
6
- import logging
7
-
8
- import httpx
9
- from openai import AsyncOpenAI
10
-
11
- import hud
12
- from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
13
- from hud.clients.utils.retry_transport import create_retry_httpx_client
14
- from hud.types import Task, Trace
15
- from hud.utils.hud_console import HUDConsole
16
-
17
- from .config import Config
18
-
19
- logger = logging.getLogger(__name__)
20
- hud_console = HUDConsole(logger)
21
-
22
-
23
- class Actor:
24
- """Collects episodes using vLLM-served models via HUD agents."""
25
-
26
- def __init__(self, config: Config) -> None:
27
- self.config = config
28
- self.actor_config = config.actor
29
- self.current_adapter = config.model.base_model
30
-
31
- # Setup OpenAI client for vLLM
32
- base_url = self.actor_config.vllm_base_url.replace("localhost", "127.0.0.1")
33
- self.openai_client = self._create_openai_client(base_url)
34
-
35
- def _create_openai_client(self, base_url: str) -> AsyncOpenAI:
36
- """Create OpenAI client with optimized settings for vLLM."""
37
- # Match connection limits to parallel_episodes to avoid bottlenecks
38
- # Use shorter per-request timeout and keep retries modest to avoid long blocking
39
- http_client = create_retry_httpx_client(
40
- timeout=httpx.Timeout(30.0),
41
- )
42
- return AsyncOpenAI(
43
- base_url=base_url,
44
- api_key=self.actor_config.vllm_api_key,
45
- http_client=http_client,
46
- max_retries=2,
47
- )
48
-
49
- def create_agent(self) -> GenericOpenAIChatAgent:
50
- """Create an agent with the current adapter."""
51
- return GenericOpenAIChatAgent(
52
- openai_client=self.openai_client,
53
- model_name=self.current_adapter,
54
- allowed_tools=self.actor_config.allowed_tools,
55
- append_setup_output=False,
56
- system_prompt=self.actor_config.system_prompt,
57
- verbose=self.config.verbose,
58
- completion_kwargs={
59
- "temperature": self.actor_config.temperature,
60
- "max_tokens": self.actor_config.max_new_tokens,
61
- "tool_choice": "required" if self.actor_config.force_tool_choice else "auto",
62
- },
63
- )
64
-
65
- def update_adapter(self, adapter_name: str) -> None:
66
- """Update the current adapter being used."""
67
- self.current_adapter = adapter_name
68
- hud_console.info(f"[Actor] Using adapter: {adapter_name}")
69
-
70
- async def run_tasks(self, tasks: list[Task], job_id: str) -> list[Trace]:
71
- """Run tasks and collect traces."""
72
- traces = []
73
-
74
- # Process tasks in batches respecting max_parallel_episodes limit
75
- for batch_start in range(0, len(tasks), self.actor_config.max_parallel_episodes):
76
- batch_end = min(batch_start + self.actor_config.max_parallel_episodes, len(tasks))
77
- batch = tasks[batch_start:batch_end]
78
-
79
- # Run batch in parallel with per-episode timeout protection
80
- async def run_with_timeout(t: Task) -> Trace:
81
- try:
82
- return await asyncio.wait_for(
83
- self._run_task(t, job_id),
84
- timeout=self.actor_config.episode_timeout_sec,
85
- )
86
- except TimeoutError:
87
- hud_console.warning_log(f"Episode timed out for task {t.id}")
88
- # Attach task so buffer grouping has key
89
- return Trace(isError=True, content="Episode timeout", task=t)
90
-
91
- results = await asyncio.gather(
92
- *[run_with_timeout(t) for t in batch],
93
- return_exceptions=True,
94
- )
95
-
96
- # Normalize exceptions to error traces and ensure task is attached
97
- for t, res in zip(batch, results, strict=False):
98
- if isinstance(res, Exception):
99
- hud_console.warning_log(f"Episode error: {res}")
100
- traces.append(Trace(isError=True, content=str(res), task=t))
101
- else:
102
- traces.append(res)
103
-
104
- return traces
105
-
106
- async def _run_task(self, task: Task, job_id: str) -> Trace:
107
- """Run a single task."""
108
- agent = self.create_agent()
109
-
110
- # Run the task
111
- try:
112
- with hud.trace(f"Training | {task.prompt}", job_id=job_id):
113
- result = await agent.run(task, max_steps=self.actor_config.max_steps_per_episode)
114
-
115
- except Exception:
116
- logger.info("GOT EXCEPTION")
117
- # Preserve task on exception for grouping
118
- return Trace(isError=True, task=task)
119
-
120
- result.info["tool_spec"] = agent.get_tool_schemas()
121
-
122
- return result
123
-
124
-
125
- if __name__ == "__main__":
126
- from hud.types import Task
127
-
128
- async def test_actor() -> None:
129
- """Test the actor with a single 2048 task using local hud-browser image."""
130
- config = Config()
131
- config.actor.max_parallel_episodes = 1
132
- config.actor.max_steps_per_episode = 6
133
- config.actor.episode_timeout_sec = 120
134
- config.verbose = True
135
-
136
- # Create test task with local hud-browser image
137
- task_data = {
138
- "id": "test_2048_128",
139
- "prompt": "Play the browser-based 2048 game and try to reach the 128 tile. Start by taking a screenshot, then make strategic moves using arrow keys.", # noqa: E501
140
- "mcp_config": {
141
- "local": {
142
- "command": "sh",
143
- "args": [
144
- "-c",
145
- "docker run --rm --platform linux/amd64 -i hud-browser:latest 2>/dev/null",
146
- ],
147
- }
148
- },
149
- "setup_tool": {"name": "launch_app", "arguments": {"app_name": "2048"}},
150
- "evaluate_tool": {
151
- "name": "evaluate",
152
- "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}},
153
- },
154
- "system_prompt": "You are an expert 2048 game player. Use arrow keys to reach the target tile. First take a screenshot, then make strategic moves.", # noqa: E501
155
- }
156
-
157
- task = Task(**task_data)
158
- actor = Actor(config)
159
-
160
- logger.info("Testing actor with task: %s", task.id)
161
- logger.info("Model: %s", config.model.base_model)
162
- logger.info("VLLM: %s", config.actor.vllm_base_url)
163
-
164
- traces = await actor.run_tasks([task], job_id="test_2048")
165
-
166
- for trace in traces:
167
- if trace.isError:
168
- logger.info("Error: %s", trace.content)
169
- else:
170
- logger.info("Success!")
171
- logger.info("Trace info: %s", trace.info if hasattr(trace, "info") else "No info")
172
- # Check for evaluation in the trace info
173
- if hasattr(trace, "info") and "evaluation" in trace.info:
174
- logger.info(" Evaluation: %s", trace.info["evaluation"])
175
-
176
- asyncio.run(test_actor())