hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,356 @@
1
+ """Tests for EvalContext telemetry integration with mock backend."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from typing import Any
7
+ from unittest.mock import patch
8
+
9
+ import pytest
10
+
11
+ import hud
12
+ from hud.environment import Environment
13
+ from hud.eval import Task
14
+ from hud.telemetry.exporter import _pending_futures, _pending_spans
15
+
16
+
17
@pytest.fixture(autouse=True)
def clear_pending_state():
    """Empty the exporter's pending span and future containers around every test."""

    def _reset() -> None:
        # Same order as before: spans first, then futures.
        for container in (_pending_spans, _pending_futures):
            container.clear()

    _reset()
    yield
    _reset()
25
+
26
+
27
class TestEvalContextTelemetry:
    """Drive ``hud.eval`` with the upload function patched and inspect what gets uploaded."""

    @pytest.mark.asyncio
    async def test_call_tool_records_span(self):
        """A successful ``call_tool`` yields an uploaded span carrying the expected keys."""
        captured: list[dict[str, Any]] = []

        def record_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            captured.extend(spans)
            return True

        # Environment exposing one simple tool
        env = Environment("test-env")

        @env.tool
        async def greet(name: str) -> str:
            """Say hello."""
            return f"Hello, {name}!"

        task = Task(env=env)

        with (
            patch("hud.settings.settings") as mock_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=record_upload),
            patch("hud.eval.context.make_request"),  # eval enter/exit requests are not sent
        ):
            mock_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
                hud_api_url="https://api.hud.ai",
            )

            async with hud.eval(task) as ctx:
                result = await ctx.call_tool("greet", name="World")
                # The tool result is inspected through its string form
                assert "Hello, World!" in str(result)
                trace_id = ctx.trace_id

            # Wait for the thread pool
            await asyncio.sleep(0.2)

        # At least one span must have reached the upload stand-in
        assert captured
        span = captured[0]

        # Structural keys every span is expected to carry
        for key in (
            "name",
            "trace_id",
            "span_id",
            "start_time",
            "end_time",
            "status_code",
            "attributes",
        ):
            assert key in span

        attributes = span["attributes"]
        assert attributes["task_run_id"] == trace_id
        assert attributes["category"] == "mcp"

    @pytest.mark.asyncio
    async def test_call_tool_records_error_span(self):
        """A failing ``call_tool`` yields an uploaded span whose status is ERROR."""
        captured: list[dict[str, Any]] = []

        def record_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            captured.extend(spans)
            return True

        env = Environment("test-env")

        @env.tool
        async def failing_tool() -> str:
            """Always fails."""
            raise ValueError("Tool error")

        task = Task(env=env)

        with (
            patch("hud.settings.settings") as mock_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=record_upload),
            patch("hud.eval.context.make_request"),
        ):
            mock_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
                hud_api_url="https://api.hud.ai",
            )

            async with hud.eval(task) as ctx:
                # Tool errors are wrapped in ToolError
                with pytest.raises(Exception, match="Tool error"):
                    await ctx.call_tool("failing_tool")

            await asyncio.sleep(0.2)

        # The first uploaded span must be flagged as an error
        assert captured
        first = captured[0]
        assert first["status_code"] == "ERROR"
        # The status message carries the original error text
        assert "Tool error" in (first.get("status_message") or "")

    @pytest.mark.asyncio
    async def test_multiple_call_tools_record_spans(self):
        """Two ``call_tool`` invocations result in at least two uploaded spans."""
        captured: list[dict[str, Any]] = []

        def record_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            captured.extend(spans)
            return True

        env = Environment("test-env")

        @env.tool
        async def add(a: int, b: int) -> int:
            """Add two numbers."""
            return a + b

        @env.tool
        async def multiply(a: int, b: int) -> int:
            """Multiply two numbers."""
            return a * b

        task = Task(env=env)

        with (
            patch("hud.settings.settings") as mock_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=record_upload),
            patch("hud.eval.context.make_request"),
        ):
            mock_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
                hud_api_url="https://api.hud.ai",
            )

            async with hud.eval(task) as ctx:
                sum_result = await ctx.call_tool("add", a=2, b=3)
                product_result = await ctx.call_tool("multiply", a=4, b=5)
                # Results are checked through their string form
                assert "5" in str(sum_result)
                assert "20" in str(product_result)

            await asyncio.sleep(0.2)

        # At least one span per tool call
        assert len(captured) >= 2

    @pytest.mark.asyncio
    async def test_flush_called_on_context_exit(self):
        """Leaving the eval context calls ``flush`` exactly once with the context's trace id."""
        env = Environment("test-env")

        @env.tool
        async def simple_tool() -> str:
            return "done"

        task = Task(env=env)

        with (
            patch("hud.eval.context.flush") as mock_flush,
            patch("hud.settings.settings") as mock_settings,
            patch("hud.eval.context.make_request"),
        ):
            mock_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_api_url="https://api.hud.ai",
            )

            async with hud.eval(task) as ctx:
                await ctx.call_tool("simple_tool")
                trace_id = ctx.trace_id

        mock_flush.assert_called_once_with(trace_id)

    @pytest.mark.asyncio
    async def test_telemetry_disabled_no_upload(self):
        """With ``telemetry_enabled`` set to False the upload function is never invoked."""
        upload_calls: list[tuple[tuple[Any, ...], dict[str, Any]]] = []

        def should_not_be_called(*args: Any, **kwargs: Any) -> bool:
            upload_calls.append((args, kwargs))
            return True

        env = Environment("test-env")

        @env.tool
        async def test_tool() -> str:
            return "ok"

        task = Task(env=env)

        with (
            patch("hud.settings.settings") as mock_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=should_not_be_called),
            patch("hud.eval.context.make_request"),
        ):
            mock_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=False,  # the switch under test
                hud_telemetry_url="https://api.hud.ai",
                hud_api_url="https://api.hud.ai",
            )

            async with hud.eval(task) as ctx:
                await ctx.call_tool("test_tool")

            await asyncio.sleep(0.1)

        assert not upload_calls
250
+
251
+
252
class TestSpanFormat:
    """Tests for the shape of the span dictionaries handed to the upload function."""

    @pytest.mark.asyncio
    async def test_span_has_required_fields(self):
        """The first uploaded span carries the required keys with the expected id lengths."""
        uploaded_spans: list[dict[str, Any]] = []

        def capture_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            uploaded_spans.extend(spans)
            return True

        env = Environment("test-env")

        @env.tool
        async def echo(message: str) -> str:
            return message

        task = Task(env=env)

        with (
            patch("hud.settings.settings") as mock_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=capture_upload),
            patch("hud.eval.context.make_request"),
        ):
            mock_settings.api_key = "test-key"
            mock_settings.telemetry_enabled = True
            mock_settings.hud_telemetry_url = "https://api.hud.ai"
            mock_settings.hud_api_url = "https://api.hud.ai"

            async with hud.eval(task) as ctx:
                await ctx.call_tool("echo", message="test")

            # Wait for the thread pool before inspecting the captured spans
            await asyncio.sleep(0.2)

        assert len(uploaded_spans) >= 1
        span = uploaded_spans[0]

        # Required fields from HudSpan
        assert "name" in span
        assert "trace_id" in span
        assert len(span["trace_id"]) == 32  # only the length is checked here
        assert "span_id" in span
        assert len(span["span_id"]) == 16  # only the length is checked here
        assert "start_time" in span
        assert "end_time" in span
        assert "status_code" in span
        assert span["status_code"] in ("OK", "ERROR", "UNSET")

        # Attributes
        assert "attributes" in span
        attrs = span["attributes"]
        assert "task_run_id" in attrs
        assert "category" in attrs

    @pytest.mark.asyncio
    async def test_span_timestamps_are_iso(self):
        """Span ``start_time`` and ``end_time`` begin with a ``YYYY-MM-DDTHH:MM:SS`` prefix."""
        uploaded_spans: list[dict[str, Any]] = []

        def capture_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            uploaded_spans.extend(spans)
            return True

        env = Environment("test-env")

        @env.tool
        async def noop() -> None:
            pass

        task = Task(env=env)

        with (
            patch("hud.settings.settings") as mock_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=capture_upload),
            patch("hud.eval.context.make_request"),
        ):
            mock_settings.api_key = "test-key"
            mock_settings.telemetry_enabled = True
            mock_settings.hud_telemetry_url = "https://api.hud.ai"
            mock_settings.hud_api_url = "https://api.hud.ai"

            async with hud.eval(task) as ctx:
                await ctx.call_tool("noop")

            await asyncio.sleep(0.2)

        # Fail with a clear assertion (not an IndexError) if nothing was uploaded
        assert len(uploaded_spans) >= 1
        span = uploaded_spans[0]

        # re.match anchors only at the start, so just the date/time prefix up to
        # seconds is validated; anything after it is not checked.
        import re

        iso_pattern = r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}"
        assert re.match(iso_pattern, span["start_time"])
        assert re.match(iso_pattern, span["end_time"])
@@ -0,0 +1,258 @@
1
+ """Tests for telemetry exporter with mock backend."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from typing import Any
7
+ from unittest.mock import patch
8
+
9
+ import pytest
10
+
11
+ from hud.telemetry.exporter import (
12
+ _do_upload,
13
+ _pending_futures,
14
+ _pending_spans,
15
+ flush,
16
+ queue_span,
17
+ shutdown,
18
+ )
19
+
20
+
21
@pytest.fixture(autouse=True)
def clear_pending_state():
    """Empty the exporter's pending span and future containers around every test."""

    def _reset() -> None:
        # Same order as before: spans first, then futures.
        for container in (_pending_spans, _pending_futures):
            container.clear()

    _reset()
    yield
    _reset()
29
+
30
+
31
class TestDoUpload:
    """Unit tests for ``_do_upload`` with ``make_request_sync`` patched out."""

    def test_upload_success(self):
        """Returns True and issues one POST whose kwargs carry the task id, key and telemetry."""
        payload = [{"name": "test.span", "attributes": {"task_run_id": "test-task-123"}}]

        with patch("hud.telemetry.exporter.make_request_sync") as mock_request:
            outcome = _do_upload(
                task_run_id="test-task-123",
                spans=payload,
                telemetry_url="https://api.hud.ai",
                api_key="test-key",
            )

        assert outcome is True
        mock_request.assert_called_once()

        # Inspect the keyword arguments of the single request
        sent = mock_request.call_args.kwargs
        assert sent["method"] == "POST"
        assert "test-task-123" in sent["url"]
        assert sent["api_key"] == "test-key"
        assert "telemetry" in sent["json"]

    def test_upload_failure(self):
        """Returns False when the underlying request raises."""
        with patch(
            "hud.telemetry.exporter.make_request_sync",
            side_effect=Exception("Network error"),
        ):
            outcome = _do_upload(
                task_run_id="test-task-123",
                spans=[{"name": "test.span"}],
                telemetry_url="https://api.hud.ai",
                api_key="test-key",
            )

        assert outcome is False
65
+
66
+
67
class TestQueueSpan:
    """Unit tests for ``queue_span`` under different settings and span shapes."""

    def test_queue_span_without_api_key(self):
        """Nothing is queued when ``api_key`` is None."""
        with patch("hud.settings.settings") as mock_settings:
            mock_settings.configure_mock(api_key=None, telemetry_enabled=True)

            queue_span({"name": "test", "attributes": {"task_run_id": "123"}})

        assert len(_pending_spans) == 0

    def test_queue_span_without_telemetry_enabled(self):
        """Nothing is queued when ``telemetry_enabled`` is False."""
        with patch("hud.settings.settings") as mock_settings:
            mock_settings.configure_mock(api_key="test-key", telemetry_enabled=False)

            queue_span({"name": "test", "attributes": {"task_run_id": "123"}})

        assert len(_pending_spans) == 0

    def test_queue_span_without_task_run_id(self):
        """A span whose attributes lack ``task_run_id`` is not queued."""
        with patch("hud.settings.settings") as mock_settings:
            mock_settings.configure_mock(api_key="test-key", telemetry_enabled=True)

            queue_span({"name": "test", "attributes": {}})

        assert len(_pending_spans) == 0

    def test_queue_span_adds_to_pending(self):
        """A queued span is found under its task id when the upload reports failure."""
        # _do_upload is patched to return False; the test expects the span to
        # remain in _pending_spans afterwards. This test is synchronous, so
        # queue_span is called without a running event loop.
        with (
            patch("hud.settings.settings") as mock_settings,
            patch("hud.telemetry.exporter._do_upload", return_value=False),
        ):
            mock_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
            )

            queued = {"name": "test", "attributes": {"task_run_id": "task-123"}}
            queue_span(queued)

            assert "task-123" in _pending_spans
            assert queued in _pending_spans["task-123"]

    @pytest.mark.asyncio
    async def test_queue_span_uploads_async(self):
        """Inside a running event loop the queued span reaches the upload function."""
        received: list[dict[str, Any]] = []

        def mock_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            received.extend(spans)
            return True

        with (
            patch("hud.settings.settings") as mock_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=mock_upload),
        ):
            mock_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
            )

            queue_span({"name": "test.async", "attributes": {"task_run_id": "async-task"}})

            # Wait for the thread pool to complete
            await asyncio.sleep(0.1)

        assert len(received) == 1
        assert received[0]["name"] == "test.async"
148
+
149
+
150
class TestFlush:
    """Unit tests for ``flush`` against spans placed directly into ``_pending_spans``."""

    def test_flush_specific_task(self):
        """Flushing one task id uploads and removes only that task's spans."""
        batches: list[tuple[str, list[dict[str, Any]]]] = []

        def mock_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            batches.append((task_run_id, spans))
            return True

        with (
            patch("hud.settings.settings") as mock_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=mock_upload),
        ):
            mock_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
            )

            # One pending span for each of two tasks
            _pending_spans["task-1"].append({"name": "span1"})
            _pending_spans["task-2"].append({"name": "span2"})

            # Only task-1 is flushed
            flush("task-1")

        assert len(batches) == 1
        flushed_task_id, _ = batches[0]
        assert flushed_task_id == "task-1"
        assert "task-1" not in _pending_spans
        assert "task-2" in _pending_spans

    def test_flush_all_tasks(self):
        """Flushing without an argument uploads every task's spans and empties the queue."""
        batches: list[tuple[str, list[dict[str, Any]]]] = []

        def mock_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            batches.append((task_run_id, spans))
            return True

        with (
            patch("hud.settings.settings") as mock_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=mock_upload),
        ):
            mock_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
            )

            _pending_spans["task-1"].append({"name": "span1"})
            _pending_spans["task-2"].append({"name": "span2"})

            flush()

        assert len(batches) == 2
        assert len(_pending_spans) == 0

    def test_flush_clears_without_api_key(self):
        """With ``api_key`` set to None, flush still leaves no pending spans behind."""
        with patch("hud.settings.settings") as mock_settings:
            mock_settings.configure_mock(api_key=None, telemetry_enabled=True)

            _pending_spans["task-1"].append({"name": "span1"})

            flush()

        assert len(_pending_spans) == 0
226
+
227
+
228
class TestShutdown:
    """Unit tests for ``shutdown``."""

    def test_shutdown_flushes_pending(self):
        """``shutdown`` returns True and the pending task's spans reach the upload function."""
        uploaded_task_ids: list[str] = []

        def mock_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            uploaded_task_ids.append(task_run_id)
            return True

        with (
            patch("hud.settings.settings") as mock_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=mock_upload),
            patch("hud.telemetry.exporter._get_api_key", return_value="test-key"),
        ):
            mock_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
            )

            # One span left pending at shutdown time
            _pending_spans["shutdown-task"].append({"name": "final-span"})

            outcome = shutdown(timeout=1.0)

        assert outcome is True
        assert "shutdown-task" in uploaded_task_ids