hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
"""Tests for EvalContext telemetry integration with mock backend."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from typing import Any
|
|
7
|
+
from unittest.mock import patch
|
|
8
|
+
|
|
9
|
+
import pytest
|
|
10
|
+
|
|
11
|
+
import hud
|
|
12
|
+
from hud.environment import Environment
|
|
13
|
+
from hud.eval import Task
|
|
14
|
+
from hud.telemetry.exporter import _pending_futures, _pending_spans
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@pytest.fixture(autouse=True)
def clear_pending_state():
    """Empty the exporter's pending containers around every test.

    ``_pending_spans`` and ``_pending_futures`` are module-level objects
    imported from ``hud.telemetry.exporter``; both are cleared before the test
    body runs and again after it finishes.
    """
    for registry in (_pending_spans, _pending_futures):
        registry.clear()
    yield
    for registry in (_pending_spans, _pending_futures):
        registry.clear()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class TestEvalContextTelemetry:
    """Checks that tool calls made inside ``hud.eval`` reach the span uploader.

    Every test patches ``hud.settings.settings`` and
    ``hud.eval.context.make_request`` so nothing is sent to a real backend;
    most also replace ``hud.telemetry.exporter._do_upload`` with a local
    function that records what it is given.
    """

    @pytest.mark.asyncio
    async def test_call_tool_records_span(self):
        """A successful ``call_tool`` uploads a span carrying the expected keys."""
        captured: list[dict[str, Any]] = []

        def capture_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            # Replacement for ``_do_upload``: keep the spans and report success.
            captured.extend(spans)
            return True

        # Environment exposing a single trivial tool.
        env = Environment("test-env")

        @env.tool
        async def greet(name: str) -> str:
            """Say hello."""
            return f"Hello, {name}!"

        # Task built directly from the environment.
        task = Task(env=env)

        with (
            patch("hud.settings.settings") as fake_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=capture_upload),
            patch("hud.eval.context.make_request"),  # Don't send eval enter/exit
        ):
            fake_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
                hud_api_url="https://api.hud.ai",
            )

            async with hud.eval(task) as ctx:
                outcome = await ctx.call_tool("greet", name="World")
                # Only the string form of the result is inspected here.
                assert "Hello, World!" in str(outcome)
                trace_id = ctx.trace_id

            # Pause briefly so a background upload can finish
            # (the original note reads "Wait for thread pool").
            await asyncio.sleep(0.2)

            # At least one span must have reached the fake uploader.
            assert len(captured) >= 1
            span = captured[0]

            # Top-level keys every span is expected to carry.
            expected_keys = (
                "name",
                "trace_id",
                "span_id",
                "start_time",
                "end_time",
                "status_code",
                "attributes",
            )
            for key in expected_keys:
                assert key in span

            # The attributes tie the span to this eval run.
            attrs = span["attributes"]
            assert attrs["task_run_id"] == trace_id
            assert attrs["category"] == "mcp"

    @pytest.mark.asyncio
    async def test_call_tool_records_error_span(self):
        """A tool that raises still produces a span, marked with ERROR status."""
        captured: list[dict[str, Any]] = []

        def capture_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            # Replacement for ``_do_upload``: keep the spans and report success.
            captured.extend(spans)
            return True

        env = Environment("test-env")

        @env.tool
        async def failing_tool() -> str:
            """Always fails."""
            raise ValueError("Tool error")

        task = Task(env=env)

        with (
            patch("hud.settings.settings") as fake_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=capture_upload),
            patch("hud.eval.context.make_request"),
        ):
            fake_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
                hud_api_url="https://api.hud.ai",
            )

            async with hud.eval(task) as ctx:
                # The original note says tool errors are wrapped in ToolError;
                # the test only requires *some* exception mentioning the message.
                with pytest.raises(Exception, match="Tool error"):
                    await ctx.call_tool("failing_tool")

            await asyncio.sleep(0.2)

            # The failure must still have been reported as a span.
            assert len(captured) >= 1
            span = captured[0]
            assert span["status_code"] == "ERROR"
            # The status message should carry the original error text.
            message = span.get("status_message") or ""
            assert "Tool error" in message

    @pytest.mark.asyncio
    async def test_multiple_call_tools_record_spans(self):
        """Two separate tool calls result in at least two uploaded spans."""
        captured: list[dict[str, Any]] = []

        def capture_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            # Replacement for ``_do_upload``: keep the spans and report success.
            captured.extend(spans)
            return True

        env = Environment("test-env")

        @env.tool
        async def add(a: int, b: int) -> int:
            """Add two numbers."""
            return a + b

        @env.tool
        async def multiply(a: int, b: int) -> int:
            """Multiply two numbers."""
            return a * b

        task = Task(env=env)

        with (
            patch("hud.settings.settings") as fake_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=capture_upload),
            patch("hud.eval.context.make_request"),
        ):
            fake_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
                hud_api_url="https://api.hud.ai",
            )

            async with hud.eval(task) as ctx:
                sum_result = await ctx.call_tool("add", a=2, b=3)
                product_result = await ctx.call_tool("multiply", a=4, b=5)
                # Results are checked through their string form only.
                assert "5" in str(sum_result)
                assert "20" in str(product_result)

            await asyncio.sleep(0.2)

            # One span per call is the minimum expected.
            assert len(captured) >= 2

    @pytest.mark.asyncio
    async def test_flush_called_on_context_exit(self):
        """Leaving the eval context calls ``flush`` once with the trace id."""
        env = Environment("test-env")

        @env.tool
        async def simple_tool() -> str:
            return "done"

        task = Task(env=env)

        with (
            patch("hud.eval.context.flush") as flush_mock,
            patch("hud.settings.settings") as fake_settings,
            patch("hud.eval.context.make_request"),
        ):
            fake_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_api_url="https://api.hud.ai",
            )

            async with hud.eval(task) as ctx:
                await ctx.call_tool("simple_tool")
                trace_id = ctx.trace_id

            # By now the context has exited, so flush must have run for it.
            flush_mock.assert_called_once_with(trace_id)

    @pytest.mark.asyncio
    async def test_telemetry_disabled_no_upload(self):
        """With ``telemetry_enabled`` False the uploader is never invoked."""
        upload_calls: list[tuple[Any, ...]] = []

        def should_not_be_called(*args: Any, **kwargs: Any) -> bool:
            # Any invocation is recorded and makes the final assertion fail.
            upload_calls.append(args)
            return True

        env = Environment("test-env")

        @env.tool
        async def test_tool() -> str:
            return "ok"

        task = Task(env=env)

        with (
            patch("hud.settings.settings") as fake_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=should_not_be_called),
            patch("hud.eval.context.make_request"),
        ):
            fake_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=False,  # Disabled!
                hud_telemetry_url="https://api.hud.ai",
                hud_api_url="https://api.hud.ai",
            )

            async with hud.eval(task) as ctx:
                await ctx.call_tool("test_tool")

            await asyncio.sleep(0.1)

            assert not upload_calls
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
class TestSpanFormat:
    """Tests for the format of recorded spans.

    Both tests run one tool call inside ``hud.eval`` with
    ``hud.telemetry.exporter._do_upload`` replaced by a local collector, then
    inspect the first span that collector received.
    """

    @pytest.mark.asyncio
    async def test_span_has_required_fields(self):
        """Test that spans have all required HudSpan fields."""
        uploaded_spans: list[dict[str, Any]] = []

        def capture_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            # Stand-in for the real uploader: remember the spans, report success.
            uploaded_spans.extend(spans)
            return True

        env = Environment("test-env")

        @env.tool
        async def echo(message: str) -> str:
            return message

        task = Task(env=env)

        with (
            patch("hud.settings.settings") as mock_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=capture_upload),
            patch("hud.eval.context.make_request"),
        ):
            mock_settings.api_key = "test-key"
            mock_settings.telemetry_enabled = True
            mock_settings.hud_telemetry_url = "https://api.hud.ai"
            mock_settings.hud_api_url = "https://api.hud.ai"

            async with hud.eval(task) as ctx:
                await ctx.call_tool("echo", message="test")

            # Pause briefly so a background upload can finish.
            await asyncio.sleep(0.2)

            assert len(uploaded_spans) >= 1
            span = uploaded_spans[0]

            # Required fields from HudSpan. Only the *length* of the ids is
            # checked below; their character set is not validated.
            assert "name" in span
            assert "trace_id" in span
            assert len(span["trace_id"]) == 32  # 32-char hex
            assert "span_id" in span
            assert len(span["span_id"]) == 16  # 16-char hex
            assert "start_time" in span
            assert "end_time" in span
            assert "status_code" in span
            assert span["status_code"] in ("OK", "ERROR", "UNSET")

            # Attributes
            assert "attributes" in span
            attrs = span["attributes"]
            assert "task_run_id" in attrs
            assert "category" in attrs

    @pytest.mark.asyncio
    async def test_span_timestamps_are_iso(self):
        """Test that span timestamps start with an ISO-8601 date and time."""
        import re

        uploaded_spans: list[dict[str, Any]] = []

        def capture_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            # Stand-in for the real uploader: remember the spans, report success.
            uploaded_spans.extend(spans)
            return True

        env = Environment("test-env")

        @env.tool
        async def noop() -> None:
            pass

        task = Task(env=env)

        with (
            patch("hud.settings.settings") as mock_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=capture_upload),
            patch("hud.eval.context.make_request"),
        ):
            mock_settings.api_key = "test-key"
            mock_settings.telemetry_enabled = True
            mock_settings.hud_telemetry_url = "https://api.hud.ai"
            mock_settings.hud_api_url = "https://api.hud.ai"

            async with hud.eval(task) as ctx:
                await ctx.call_tool("noop")

            # Pause briefly so a background upload can finish.
            await asyncio.sleep(0.2)

            # Fail with a clear assertion (not an IndexError) when nothing was
            # uploaded, matching the guard used by the other span tests.
            assert len(uploaded_spans) >= 1
            span = uploaded_spans[0]

            # ``re.match`` anchors only at the start, so this verifies the
            # ``YYYY-MM-DDTHH:MM:SS`` prefix; fractional seconds and any
            # timezone suffix are not checked.
            iso_pattern = r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}"
            assert re.match(iso_pattern, span["start_time"])
            assert re.match(iso_pattern, span["end_time"])
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
"""Tests for telemetry exporter with mock backend."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from typing import Any
|
|
7
|
+
from unittest.mock import patch
|
|
8
|
+
|
|
9
|
+
import pytest
|
|
10
|
+
|
|
11
|
+
from hud.telemetry.exporter import (
|
|
12
|
+
_do_upload,
|
|
13
|
+
_pending_futures,
|
|
14
|
+
_pending_spans,
|
|
15
|
+
flush,
|
|
16
|
+
queue_span,
|
|
17
|
+
shutdown,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@pytest.fixture(autouse=True)
def clear_pending_state():
    """Empty the exporter's pending containers around every test.

    ``_pending_spans`` and ``_pending_futures`` are module-level objects
    imported from ``hud.telemetry.exporter``; both are cleared before the test
    body runs and again after it finishes.
    """
    for registry in (_pending_spans, _pending_futures):
        registry.clear()
    yield
    for registry in (_pending_spans, _pending_futures):
        registry.clear()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class TestDoUpload:
    """Unit tests for ``_do_upload`` with ``make_request_sync`` mocked out."""

    def test_upload_success(self):
        """A working request helper yields True and exactly one POST call."""
        run_id = "test-task-123"
        payload = [{"name": "test.span", "attributes": {"task_run_id": run_id}}]

        with patch("hud.telemetry.exporter.make_request_sync") as request_mock:
            outcome = _do_upload(
                task_run_id=run_id,
                spans=payload,
                telemetry_url="https://api.hud.ai",
                api_key="test-key",
            )

            assert outcome is True
            request_mock.assert_called_once()

            # Inspect the keyword arguments of that single request.
            sent = request_mock.call_args.kwargs
            assert sent["method"] == "POST"
            assert run_id in sent["url"]
            assert sent["api_key"] == "test-key"
            assert "telemetry" in sent["json"]

    def test_upload_failure(self):
        """An exception from the request helper is reported as False."""
        with patch(
            "hud.telemetry.exporter.make_request_sync",
            side_effect=Exception("Network error"),
        ):
            outcome = _do_upload(
                task_run_id="test-task-123",
                spans=[{"name": "test.span"}],
                telemetry_url="https://api.hud.ai",
                api_key="test-key",
            )

            assert outcome is False
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class TestQueueSpan:
    """Tests for ``queue_span``: when a span is dropped, kept, or uploaded."""

    def test_queue_span_without_api_key(self):
        """With ``api_key`` set to None nothing lands in ``_pending_spans``."""
        with patch("hud.settings.settings") as fake_settings:
            fake_settings.configure_mock(api_key=None, telemetry_enabled=True)

            queue_span({"name": "test", "attributes": {"task_run_id": "123"}})

            assert len(_pending_spans) == 0

    def test_queue_span_without_telemetry_enabled(self):
        """With ``telemetry_enabled`` False nothing lands in ``_pending_spans``."""
        with patch("hud.settings.settings") as fake_settings:
            fake_settings.configure_mock(api_key="test-key", telemetry_enabled=False)

            queue_span({"name": "test", "attributes": {"task_run_id": "123"}})

            assert len(_pending_spans) == 0

    def test_queue_span_without_task_run_id(self):
        """A span whose attributes lack ``task_run_id`` is not queued."""
        with patch("hud.settings.settings") as fake_settings:
            fake_settings.configure_mock(api_key="test-key", telemetry_enabled=True)

            queue_span({"name": "test", "attributes": {}})

            assert len(_pending_spans) == 0

    def test_queue_span_adds_to_pending(self):
        """A queued span is stored under its ``task_run_id``."""
        # ``_do_upload`` is patched to return False. The original author's
        # intent (per their notes) is that a failed upload leaves the span in
        # the pending map, and that this synchronous test, having no running
        # event loop, exercises a synchronous upload path. Neither detail is
        # visible from this test alone.
        with (
            patch("hud.settings.settings") as fake_settings,
            patch("hud.telemetry.exporter._do_upload", return_value=False),
        ):
            fake_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
            )

            span = {"name": "test", "attributes": {"task_run_id": "task-123"}}
            queue_span(span)

            # The span must be retrievable under its task run id.
            assert "task-123" in _pending_spans
            assert span in _pending_spans["task-123"]

    @pytest.mark.asyncio
    async def test_queue_span_uploads_async(self):
        """Inside a running event loop the queued span reaches the uploader."""
        received: list[dict[str, Any]] = []

        def mock_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            # Stand-in for ``_do_upload``: remember the spans, report success.
            received.extend(spans)
            return True

        with (
            patch("hud.settings.settings") as fake_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=mock_upload),
        ):
            fake_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
            )

            queue_span({"name": "test.async", "attributes": {"task_run_id": "async-task"}})

            # Pause briefly so a background upload can finish
            # (the original note reads "Wait for thread pool to complete").
            await asyncio.sleep(0.1)

            assert len(received) == 1
            assert received[0]["name"] == "test.async"
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
class TestFlush:
    """Tests for ``flush``, both for one task run and for everything pending."""

    def test_flush_specific_task(self):
        """``flush("task-1")`` uploads only that task and leaves the other queued."""
        batches: list[tuple[str, list[dict[str, Any]]]] = []

        def mock_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            # Stand-in for ``_do_upload``: record which run was uploaded.
            batches.append((task_run_id, spans))
            return True

        with (
            patch("hud.settings.settings") as fake_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=mock_upload),
        ):
            fake_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
            )

            # Seed one pending span for each of two task runs.
            _pending_spans["task-1"].append({"name": "span1"})
            _pending_spans["task-2"].append({"name": "span2"})

            # Ask for task-1 only.
            flush("task-1")

            flushed_ids = [run_id for run_id, _ in batches]
            assert flushed_ids == ["task-1"]
            assert "task-1" not in _pending_spans
            assert "task-2" in _pending_spans

    def test_flush_all_tasks(self):
        """``flush()`` without arguments uploads every pending task run."""
        batches: list[tuple[str, list[dict[str, Any]]]] = []

        def mock_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            # Stand-in for ``_do_upload``: record which run was uploaded.
            batches.append((task_run_id, spans))
            return True

        with (
            patch("hud.settings.settings") as fake_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=mock_upload),
        ):
            fake_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
            )

            _pending_spans["task-1"].append({"name": "span1"})
            _pending_spans["task-2"].append({"name": "span2"})

            flush()

            assert len(batches) == 2
            assert len(_pending_spans) == 0

    def test_flush_clears_without_api_key(self):
        """With no API key, ``flush()`` still empties ``_pending_spans``."""
        with patch("hud.settings.settings") as fake_settings:
            fake_settings.configure_mock(api_key=None, telemetry_enabled=True)

            _pending_spans["task-1"].append({"name": "span1"})

            flush()

            assert len(_pending_spans) == 0
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
class TestShutdown:
    """Tests for ``shutdown``."""

    def test_shutdown_flushes_pending(self):
        """``shutdown`` returns True and uploads spans that were still pending."""
        uploaded_ids: list[str] = []

        def mock_upload(
            task_run_id: str,
            spans: list[dict[str, Any]],
            telemetry_url: str,
            api_key: str,
        ) -> bool:
            # Stand-in for ``_do_upload``: record only the task run id.
            uploaded_ids.append(task_run_id)
            return True

        with (
            patch("hud.settings.settings") as fake_settings,
            patch("hud.telemetry.exporter._do_upload", side_effect=mock_upload),
            patch("hud.telemetry.exporter._get_api_key", return_value="test-key"),
        ):
            fake_settings.configure_mock(
                api_key="test-key",
                telemetry_enabled=True,
                hud_telemetry_url="https://api.hud.ai",
            )

            # Leave one span queued so shutdown has something to send.
            _pending_spans["shutdown-task"].append({"name": "final-span"})

            outcome = shutdown(timeout=1.0)

            assert outcome is True
            assert "shutdown-task" in uploaded_ids
|