hud-python 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic.
- hud/__init__.py +7 -4
- hud/adapters/common/adapter.py +14 -3
- hud/adapters/common/tests/test_adapter.py +16 -4
- hud/datasets.py +188 -0
- hud/env/docker_client.py +14 -2
- hud/env/local_docker_client.py +28 -6
- hud/gym.py +0 -9
- hud/{mcp_agent → mcp}/__init__.py +2 -0
- hud/mcp/base.py +631 -0
- hud/{mcp_agent → mcp}/claude.py +52 -47
- hud/mcp/client.py +312 -0
- hud/{mcp_agent → mcp}/langchain.py +52 -33
- hud/{mcp_agent → mcp}/openai.py +56 -40
- hud/{mcp_agent → mcp}/tests/test_base.py +129 -54
- hud/mcp/tests/test_claude.py +294 -0
- hud/mcp/tests/test_client.py +324 -0
- hud/mcp/tests/test_openai.py +238 -0
- hud/settings.py +6 -0
- hud/task.py +2 -88
- hud/taskset.py +2 -23
- hud/telemetry/__init__.py +5 -0
- hud/telemetry/_trace.py +180 -17
- hud/telemetry/context.py +79 -0
- hud/telemetry/exporter.py +165 -6
- hud/telemetry/job.py +141 -0
- hud/telemetry/tests/test_trace.py +36 -25
- hud/tools/__init__.py +14 -1
- hud/tools/computer/hud.py +13 -0
- hud/tools/executors/__init__.py +19 -2
- hud/tools/executors/pyautogui.py +84 -50
- hud/tools/executors/tests/test_pyautogui_executor.py +4 -1
- hud/tools/playwright_tool.py +73 -67
- hud/tools/tests/test_edit.py +8 -1
- hud/tools/tests/test_tools.py +3 -0
- hud/trajectory.py +5 -1
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/METADATA +20 -14
- {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/RECORD +42 -47
- hud/evaluators/__init__.py +0 -9
- hud/evaluators/base.py +0 -32
- hud/evaluators/inspect.py +0 -24
- hud/evaluators/judge.py +0 -189
- hud/evaluators/match.py +0 -156
- hud/evaluators/remote.py +0 -65
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +0 -12
- hud/evaluators/tests/test_judge.py +0 -231
- hud/evaluators/tests/test_match.py +0 -115
- hud/evaluators/tests/test_remote.py +0 -98
- hud/mcp_agent/base.py +0 -723
- /hud/{mcp_agent → mcp}/tests/__init__.py +0 -0
- {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/WHEEL +0 -0
- {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/licenses/LICENSE +0 -0
hud/telemetry/_trace.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+# ruff: noqa: T201
 import asyncio
 import logging
 import time
@@ -13,14 +14,12 @@ from typing import (
     TypeVar,
 )
 
-from hud.telemetry import exporter
 from hud.telemetry.context import (
     flush_buffer,
     get_current_task_run_id,
     is_root_trace,
     set_current_task_run_id,
 )
-from hud.telemetry.exporter import submit_to_worker_loop
 from hud.telemetry.instrumentation.registry import registry
 
 if TYPE_CHECKING:
@@ -54,9 +53,104 @@ def _ensure_telemetry_initialized() -> None:
         init_telemetry()
 
 
+def _detect_agent_model() -> str | None:
+    """
+    Try to auto-detect agent model from parent frames.
+    This is a best-effort approach and may not work in all cases.
+    """
+    import sys
+
+    try:
+        # Try different frame depths (2-3 typically covers most cases)
+        for depth in range(2, 3):
+            try:
+                frame = sys._getframe(depth)
+                # Check local variables for agent objects
+                for var_value in frame.f_locals.values():
+                    # Look for objects with model_name attribute
+                    if hasattr(var_value, "model_name") and hasattr(var_value, "run"):
+                        # Likely an agent object
+                        model_name = getattr(var_value, "model_name", None)
+                        if model_name:
+                            logger.debug(
+                                "Found agent with model_name in frame %d: %s", depth, model_name
+                            )
+                            return str(model_name)
+
+                # Also check self in case we're in a method
+                if "self" in frame.f_locals:
+                    self_obj = frame.f_locals["self"]
+                    if hasattr(self_obj, "model_name"):
+                        model_name = getattr(self_obj, "model_name", None)
+                        if model_name:
+                            logger.debug(
+                                "Found agent model_name in self at frame %d: %s", depth, model_name
+                            )
+                            return str(model_name)
+
+            except (ValueError, AttributeError):
+                # Frame doesn't exist at this depth or other issues
+                continue
+
+    except Exception as e:
+        logger.debug("Agent model detection failed: %s", e)
+
+    return None
+
+
+def _print_trace_url(task_run_id: str) -> None:
+    """Print the trace URL in a colorful box."""
+    url = f"https://app.hud.so/trace/{task_run_id}"
+    header = "🚀 See your agent live at:"
+
+    # ANSI color codes
+    DIM = "\033[90m"  # Dim/Gray for border (visible on both light and dark terminals)
+    GOLD = "\033[33m"  # Gold/Yellow for URL
+    RESET = "\033[0m"
+    BOLD = "\033[1m"
+
+    # Calculate box width based on the longest line
+    box_width = max(len(url), len(header)) + 6
+
+    # Box drawing characters
+    top_border = "╔" + "═" * (box_width - 2) + "╗"
+    bottom_border = "╚" + "═" * (box_width - 2) + "╝"
+    divider = "╟" + "─" * (box_width - 2) + "╢"
+
+    # Center the content
+    header_padding = (box_width - len(header) - 2) // 2
+    url_padding = (box_width - len(url) - 2) // 2
+
+    # Print the box
+    print(f"\n{DIM}{top_border}{RESET}")
+    print(
+        f"{DIM}║{RESET}{' ' * header_padding}{header}{' ' * (box_width - len(header) - header_padding - 3)}{DIM}║{RESET}"  # noqa: E501
+    )
+    print(f"{DIM}{divider}{RESET}")
+    print(
+        f"{DIM}║{RESET}{' ' * url_padding}{BOLD}{GOLD}{url}{RESET}{' ' * (box_width - len(url) - url_padding - 2)}{DIM}║{RESET}"  # noqa: E501
+    )
+    print(f"{DIM}{bottom_border}{RESET}\n")
+
+
+def _print_trace_complete_url(task_run_id: str) -> None:
+    """Print the trace completion URL in a simple colorful format."""
+    url = f"https://app.hud.so/trace/{task_run_id}"
+
+    # ANSI color codes
+    GREEN = "\033[92m"
+    GOLD = "\033[33m"
+    RESET = "\033[0m"
+    DIM = "\033[2m"
+    BOLD = "\033[1m"
+
+    print(f"\n{GREEN}✓ Trace complete!{RESET} {DIM}View at:{RESET} {BOLD}{GOLD}{url}{RESET}\n")
+
+
 @contextmanager
 def trace_open(
     name: str | None = None,
+    agent_model: str | None = None,
     run_id: str | None = None,
     attributes: dict[str, Any] | None = None,
 ) -> Generator[str, None, None]:
@@ -75,12 +169,16 @@ def trace_open(
 
     task_run_id = run_id or str(uuid.uuid4())
 
-
+    _print_trace_url(task_run_id)
 
     local_attributes = attributes.copy() if attributes is not None else {}
     if name is not None:
         local_attributes["trace_name"] = name
 
+    # Auto-detect agent if not explicitly provided
+    if agent_model is None:
+        agent_model = _detect_agent_model()
+
     start_time = time.time()
     logger.debug("Starting trace %s (Name: %s)", task_run_id, name if name else "Unnamed")
 
@@ -91,8 +189,39 @@ def trace_open(
     is_root = previous_task_id is None
     is_root_trace.set(is_root)
 
+    # Update status to initializing for root traces
+    if is_root:
+        from hud.telemetry.exporter import (
+            TaskRunStatus,
+            submit_to_worker_loop,
+            update_task_run_status,
+        )
+        from hud.telemetry.job import get_current_job_id
+
+        # Include metadata in the initial status update
+        initial_metadata = local_attributes.copy()
+        initial_metadata["is_root_trace"] = is_root
+        if agent_model:
+            initial_metadata["agent_model"] = agent_model
+
+        # Get job_id if we're in a job context
+        job_id = get_current_job_id()
+
+        coro = update_task_run_status(
+            task_run_id, TaskRunStatus.INITIALIZING, metadata=initial_metadata, job_id=job_id
+        )
+        submit_to_worker_loop(coro)
+        logger.debug("Updated task run %s status to INITIALIZING with metadata", task_run_id)
+
+    error_occurred = False
+    error_message = None
+
     try:
         yield task_run_id
+    except Exception as e:
+        error_occurred = True
+        error_message = str(e)
+        raise
     finally:
         end_time = time.time()
         duration = end_time - start_time
@@ -101,32 +230,65 @@ def trace_open(
 
         logger.debug("Finishing trace %s after %.2f seconds", task_run_id, duration)
 
+        # Update status for root traces
+        if is_root:
+            from hud.telemetry.exporter import (
+                TaskRunStatus,
+                submit_to_worker_loop,
+                update_task_run_status,
+            )
+
+            # Include final metadata with duration
+            final_metadata = local_attributes.copy()
+
+            if error_occurred:
+                coro = update_task_run_status(
+                    task_run_id, TaskRunStatus.ERROR, error_message, metadata=final_metadata
+                )
+                logger.debug("Updated task run %s status to ERROR: %s", task_run_id, error_message)
+            else:
+                coro = update_task_run_status(
+                    task_run_id, TaskRunStatus.COMPLETED, metadata=final_metadata
+                )
+                logger.debug("Updated task run %s status to COMPLETED with metadata", task_run_id)
+
+            # Wait for the status update to complete
+            future = submit_to_worker_loop(coro)
+            if future:
+                try:
+                    # Wait up to 5 seconds for the status update
+                    import concurrent.futures
+
+                    future.result(timeout=5.0)
+                    logger.debug("Status update completed successfully")
+                except concurrent.futures.TimeoutError:
+                    logger.warning("Timeout waiting for status update to complete")
+                except Exception as e:
+                    logger.error("Error waiting for status update: %s", e)
+
+        # Export any remaining records before flushing
+        if is_root:
+            from hud.telemetry.context import export_incremental
+
+            export_incremental()
+
         # Always flush the buffer for the current task
         mcp_calls = flush_buffer(export=True)
        logger.debug("Flushed %d MCP calls for trace %s", len(mcp_calls), task_run_id)
 
-        # Submit the telemetry payload to the worker queue
-        if is_root and mcp_calls:
-            coro = exporter.export_telemetry(
-                task_run_id=task_run_id,
-                trace_attributes=local_attributes,
-                mcp_calls=mcp_calls,
-            )
-            submit_to_worker_loop(coro)
-
         # Restore previous context
         set_current_task_run_id(previous_task_id)
         is_root_trace.set(was_root)
 
         # Log at the end
         if is_root:
-
-            logger.info("View trace at %s", view_url)
+            _print_trace_complete_url(task_run_id)
 
 
 @contextmanager
 def trace(
     name: str | None = None,
+    agent_model: str | None = None,
     attributes: dict[str, Any] | None = None,
 ) -> Generator[str, None, None]:
     """
@@ -142,7 +304,7 @@ def trace(
     Returns:
         The generated task run ID (UUID string) used for this trace
     """
-    with trace_open(name=name, attributes=attributes) as task_run_id:
+    with trace_open(name=name, agent_model=agent_model, attributes=attributes) as task_run_id:
         yield task_run_id
 
     # Ensure telemetry is flushed synchronously
@@ -153,6 +315,7 @@ def trace(
 
 def trace_decorator(
     name: str | None = None,
+    agent_model: str | None = None,
     attributes: dict[str, Any] | None = None,
 ) -> Any:
     """
@@ -167,7 +330,7 @@ def trace_decorator(
         @wraps(func)
         async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
             func_name = name or f"{func.__module__}.{func.__name__}"
-            with trace_open(name=func_name, attributes=attributes):
+            with trace_open(name=func_name, agent_model=agent_model, attributes=attributes):
                 return await func(*args, **kwargs)
 
         return async_wrapper
@@ -176,7 +339,7 @@ def trace_decorator(
         @wraps(func)
         def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
             func_name = name or f"{func.__module__}.{func.__name__}"
-            with trace_open(name=func_name, attributes=attributes):
+            with trace_open(name=func_name, agent_model=agent_model, attributes=attributes):
                 return func(*args, **kwargs)
 
         return sync_wrapper
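For context, the new agent_model parameter threads through trace_open, trace, and trace_decorator, and falls back to _detect_agent_model() when omitted. A minimal usage sketch follows; the import is taken from the private _trace module shown in this diff (the public re-export path may differ), and the model string plus run_my_agent helper are illustrative, not part of the package.

from hud.telemetry._trace import trace

def run_my_agent() -> None:  # hypothetical agent entry point
    ...

# Passing agent_model explicitly tags the task run up front; if omitted,
# trace_open() calls _detect_agent_model(), which scans caller frames for an
# object exposing both `model_name` and `run`.
with trace(name="checkout-flow", agent_model="claude-3-5-sonnet") as task_run_id:
    run_my_agent()
    # _print_trace_url() has already printed https://app.hud.so/trace/<task_run_id>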
hud/telemetry/context.py
CHANGED
@@ -22,6 +22,10 @@ current_task_run_id: contextvars.ContextVar[str | None] = contextvars.ContextVar(
 )
 # Global dictionary for buffering, keyed by task_run_id
 _GLOBAL_MCP_CALL_BUFFERS: defaultdict[str, list[BaseMCPCall]] = defaultdict(list)
+# Track the last exported index for each task_run_id
+_GLOBAL_EXPORT_INDICES: defaultdict[str, int] = defaultdict(int)
+# Track whether we've seen a non-init request for each task_run_id
+_GLOBAL_HAS_NON_INIT_REQUEST: defaultdict[str, bool] = defaultdict(bool)
 is_root_trace: contextvars.ContextVar[bool] = contextvars.ContextVar("is_root_trace", default=False)
 
 # Maximum buffer size before automatic flush
@@ -67,6 +71,48 @@ def buffer_mcp_call(record: BaseMCPCall | dict[str, Any]) -> None:
         flush_buffer(export=True)
 
 
+def export_incremental() -> list[BaseMCPCall]:
+    """
+    Export only new MCP calls since last export without clearing the buffer.
+
+    Returns:
+        The list of newly exported MCP calls
+    """
+    task_run_id = get_current_task_run_id()
+    if not task_run_id or not is_root_trace.get():
+        return []
+
+    buffer = _GLOBAL_MCP_CALL_BUFFERS.get(task_run_id, [])
+    last_exported_idx = _GLOBAL_EXPORT_INDICES.get(task_run_id, 0)
+
+    # Get only the new records since last export
+    new_records = buffer[last_exported_idx:]
+
+    if new_records:
+        # Update the export index
+        _GLOBAL_EXPORT_INDICES[task_run_id] = len(buffer)
+
+        # Trigger export
+        from hud.telemetry import exporter
+        from hud.telemetry.exporter import submit_to_worker_loop
+
+        # Get current trace attributes if available
+        attributes = {"incremental": True}
+
+        coro = exporter.export_telemetry(
+            task_run_id=task_run_id,
+            trace_attributes=attributes,
+            mcp_calls=new_records.copy(),  # Copy to avoid modification during export
+        )
+        submit_to_worker_loop(coro)
+
+        logger.debug(
+            "Incremental export: %d new MCP calls for trace %s", len(new_records), task_run_id
+        )
+
+    return new_records
+
+
 def flush_buffer(export: bool = False) -> list[BaseMCPCall]:
     """
     Clear the MCP calls buffer and return its contents.
@@ -83,6 +129,10 @@ def flush_buffer(export: bool = False) -> list[BaseMCPCall]:
         return []
 
     buffer_for_task = _GLOBAL_MCP_CALL_BUFFERS.pop(task_run_id, [])
+    # Clean up export index when buffer is flushed
+    _GLOBAL_EXPORT_INDICES.pop(task_run_id, None)
+    # Clean up non-init request tracking
+    _GLOBAL_HAS_NON_INIT_REQUEST.pop(task_run_id, None)
     return buffer_for_task
 
 
@@ -95,6 +145,31 @@ def create_request_record(
         logger.warning("No active task_run_id, request record will not be created")
         raise ValueError("No active task_run_id")
 
+    # Check if this is the first non-init request and update status
+    if is_root_trace.get() and not _GLOBAL_HAS_NON_INIT_REQUEST[task_run_id]:
+        # Common initialization method patterns
+        init_methods = {"initialize", "session/new", "init", "setup", "connect"}
+        method_lower = method.lower()
+
+        # Check if this is NOT an initialization method
+        if not any(init_pattern in method_lower for init_pattern in init_methods):
+            _GLOBAL_HAS_NON_INIT_REQUEST[task_run_id] = True
+
+            # Update status to running
+            from hud.telemetry.exporter import (
+                TaskRunStatus,
+                submit_to_worker_loop,
+                update_task_run_status,
+            )
+
+            coro = update_task_run_status(task_run_id, TaskRunStatus.RUNNING)
+            submit_to_worker_loop(coro)
+            logger.debug(
+                "Updated task run %s status to RUNNING on first non-init request: %s",
+                task_run_id,
+                method,
+            )
+
     record = MCPRequestCall(
         task_run_id=task_run_id,
         method=method,
@@ -128,6 +203,10 @@ def create_response_record(
     )
 
     buffer_mcp_call(record)
+
+    # Trigger incremental export when we receive a response
+    export_incremental()
+
     return record
 
 
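For reference, the incremental export added above relies on a per-trace high-water mark: _GLOBAL_EXPORT_INDICES remembers how much of the buffer has already been sent, so each export only ships the tail. A standalone sketch of that bookkeeping follows; plain strings stand in for BaseMCPCall records and all names here are illustrative, not the library API.

from collections import defaultdict

# Each trace keeps a high-water mark so repeated exports only send new records.
_buffers: defaultdict[str, list[str]] = defaultdict(list)
_export_indices: defaultdict[str, int] = defaultdict(int)

def export_new_records(task_run_id: str) -> list[str]:
    buffer = _buffers[task_run_id]
    start = _export_indices[task_run_id]
    new_records = buffer[start:]
    if new_records:
        # Advance the high-water mark only when something was exported
        _export_indices[task_run_id] = len(buffer)
    return new_records

_buffers["t1"] += ["initialize", "tools/list"]
print(export_new_records("t1"))  # ['initialize', 'tools/list']
_buffers["t1"].append("tools/call")
print(export_new_records("t1"))  # ['tools/call']  (only the new record)
print(export_new_records("t1"))  # []  (nothing new since the last export)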
hud/telemetry/exporter.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import asyncio
 import concurrent.futures  # For run_coroutine_threadsafe return type
+import enum
 import json
 import logging
 import threading
@@ -13,6 +14,7 @@ if TYPE_CHECKING:
     from collections.abc import Coroutine
 
 import httpx
+from pydantic import BaseModel
 
 from hud.settings import settings
 
@@ -25,6 +27,41 @@ from hud.telemetry.mcp_models import (  # MCPResponseCall for isinstance check
 
 logger = logging.getLogger("hud.telemetry")
 
+
+# --- Task Run Status Models ---
+class TaskRunStatus(enum.StrEnum):
+    INITIALIZING = "initializing"
+    RUNNING = "running"
+    EVALUATING = "evaluating"
+    COMPLETED = "completed"
+    ERROR = "error"
+
+
+class TaskRunStatusUpdateRequest(BaseModel):
+    """Request model for updating task run status."""
+
+    status: TaskRunStatus
+    error_message: str | None = None  # Optional error message if status is ERROR
+    metadata: dict[str, Any] | None = None  # Optional metadata for context
+    job_id: str | None = None  # Optional parent job ID
+
+
+# --- Job Status Models ---
+class JobStatus(enum.StrEnum):
+    RUNNING = "running"
+    COMPLETED = "completed"
+    ERROR = "error"
+
+
+class JobStatusUpdateRequest(BaseModel):
+    """Request model for updating job status."""
+
+    status: JobStatus
+    error_message: str | None = None  # Optional error message if status is ERROR
+    metadata: dict[str, Any] | None = None  # Optional metadata for context
+    taskset_name: str | None = None  # Optional dataset/taskset name
+
+
 # --- Worker Thread and Event Loop Management ---
 _worker_thread: threading.Thread | None = None
 _worker_loop: asyncio.AbstractEventLoop | None = None
@@ -38,7 +75,8 @@ _export_lock_async = asyncio.Lock()  # Async lock for the async queue
 _export_task_async: asyncio.Task | None = None  # Async task for processing the queue
 
 # --- Constants ---
-EXPORT_INTERVAL = 5.0  # seconds
+EXPORT_INTERVAL = 5.0  # seconds - delay between non-incremental exports
+MIN_EXPORT_INTERVAL = 0.1  # seconds - minimum delay between any exports to avoid overwhelming
 # MAX_BATCH_SIZE removed as we send one trace payload at a time
 
 
@@ -265,12 +303,19 @@ async def _process_export_queue_async() -> None:
 
             if isinstance(payload_to_process, dict):  # Ensure it's a dict before processing as such
                 await _export_trace_payload_async(payload_to_process)
+
+                # Apply appropriate delay based on export type
+                is_incremental = payload_to_process.get("attributes", {}).get("incremental", False)
+                if is_incremental:
+                    # Small delay for incremental exports to avoid overwhelming the server
+                    await asyncio.sleep(MIN_EXPORT_INTERVAL)
+                else:
+                    # Longer delay for final exports
+                    await asyncio.sleep(EXPORT_INTERVAL)
             else:
                 # Should not happen if only dicts and sentinel are queued
                 logger.warning("Unexpected item in telemetry queue: %s", type(payload_to_process))
 
-            await asyncio.sleep(EXPORT_INTERVAL)
-
     except asyncio.CancelledError:
         logger.debug("Async telemetry export processing task cancelled.")
         _export_task_async = None
@@ -340,6 +385,119 @@ async def send_telemetry_to_server(task_run_id: str, data: dict[str, Any]) -> None:
         logger.exception("Error exporting telemetry for task run %s: %s", task_run_id, e)
 
 
+async def update_task_run_status(
+    task_run_id: str,
+    status: TaskRunStatus,
+    error_message: str | None = None,
+    metadata: dict[str, Any] | None = None,
+    job_id: str | None = None,
+) -> None:
+    """Update the status of a task run."""
+    if not settings.telemetry_enabled:
+        logger.debug("Status update skipped - telemetry not enabled")
+        return
+
+    status_url = f"{settings.base_url}/v2/task_runs/{task_run_id}/status"
+
+    try:
+        async with httpx.AsyncClient() as client:
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {settings.api_key}",
+            }
+
+            request_data = TaskRunStatusUpdateRequest(
+                status=status, error_message=error_message, metadata=metadata, job_id=job_id
+            )
+
+            logger.debug(
+                "Updating status for task run %s to %s",
+                task_run_id,
+                status,
+            )
+
+            response = await client.post(
+                status_url,
+                json=request_data.model_dump(exclude_none=True),
+                headers=headers,
+                timeout=10.0,
+            )
+
+            if response.status_code >= 200 and response.status_code < 300:
+                logger.debug(
+                    "Successfully updated status for task run %s to %s",
+                    task_run_id,
+                    status,
+                )
+            else:
+                logger.warning(
+                    "Failed to update status for task run %s: HTTP %s - %s",
+                    task_run_id,
+                    response.status_code,
+                    response.text,
+                )
+    except Exception as e:
+        logger.exception("Error updating status for task run %s: %s", task_run_id, e)
+
+
+async def update_job_status(
+    job_id: str,
+    status: JobStatus,
+    error_message: str | None = None,
+    metadata: dict[str, Any] | None = None,
+    taskset_name: str | None = None,
+) -> None:
+    """Update the status of a job."""
+    if not settings.telemetry_enabled:
+        logger.debug("Job status update skipped - telemetry not enabled")
+        return
+
+    status_url = f"{settings.base_url}/v2/jobs/{job_id}/status"
+
+    try:
+        async with httpx.AsyncClient() as client:
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {settings.api_key}",
+            }
+
+            request_data = JobStatusUpdateRequest(
+                status=status,
+                error_message=error_message,
+                metadata=metadata,
+                taskset_name=taskset_name,
+            )
+
+            logger.debug(
+                "Updating status for job %s to %s",
+                job_id,
+                status,
+            )
+
+            response = await client.post(
+                status_url,
+                json=request_data.model_dump(exclude_none=True),
+                headers=headers,
+                timeout=10.0,
+            )
+
+            if response.status_code >= 200 and response.status_code < 300:
+                logger.debug(
+                    "Successfully updated status for job %s to %s",
+                    job_id,
+                    status,
+                )
+            else:
+                logger.warning(
+                    "Failed to update status for job %s: HTTP %s - %s",
+                    job_id,
+                    response.status_code,
+                    response.text,
+                )
+    except Exception as e:
+        logger.exception("Error updating status for job %s: %s", job_id, e)
+
+
 # --- Public Shutdown Function ---
 def flush(timeout: float = 10.0) -> None:
     """Flushes pending telemetry data and stops the worker thread."""
@@ -382,9 +540,10 @@ def flush(timeout: float = 10.0) -> None:
         time.sleep(0.1)
     # _export_task_async is set to None by _process_export_queue_async upon its exit.
     if _export_task_async is not None:
-
-
-            "
+        # This is often a false positive due to race conditions during shutdown
+        logger.debug(
+            "Telemetry processing task did not clear itself after sentinel. "
+            "This is normal during shutdown."
         )
     else:
         logger.debug("Telemetry processing task appears to have completed after sentinel.")